{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5309024194473111, "eval_steps": 500, "global_step": 4630, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 6.6514, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 6.7761, "step": 2 }, { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 6.6376, "step": 3 }, { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 6.9304, "step": 4 }, { "epoch": 0.0, "grad_norm": 33.39242836246673, "learning_rate": 3.816793893129771e-06, "loss": 6.98, "step": 5 }, { "epoch": 0.0, "grad_norm": 27.008682568448762, "learning_rate": 7.633587786259541e-06, "loss": 6.4733, "step": 6 }, { "epoch": 0.0, "grad_norm": 40.6397754614901, "learning_rate": 1.1450381679389314e-05, "loss": 6.7444, "step": 7 }, { "epoch": 0.0, "grad_norm": 40.6397754614901, "learning_rate": 1.1450381679389314e-05, "loss": 6.8197, "step": 8 }, { "epoch": 0.0, "grad_norm": 35.827533515372565, "learning_rate": 1.5267175572519083e-05, "loss": 6.8799, "step": 9 }, { "epoch": 0.0, "grad_norm": 33.8543071159309, "learning_rate": 1.9083969465648855e-05, "loss": 6.7145, "step": 10 }, { "epoch": 0.0, "grad_norm": 21.3516034007554, "learning_rate": 2.2900763358778628e-05, "loss": 6.3579, "step": 11 }, { "epoch": 0.0, "grad_norm": 70.28055311590448, "learning_rate": 2.6717557251908397e-05, "loss": 6.5739, "step": 12 }, { "epoch": 0.0, "grad_norm": 33.42339347618788, "learning_rate": 3.0534351145038166e-05, "loss": 6.4945, "step": 13 }, { "epoch": 0.0, "grad_norm": 26.928669960691515, "learning_rate": 3.435114503816794e-05, "loss": 6.3436, "step": 14 }, { "epoch": 0.0, "grad_norm": 18.970040777213786, "learning_rate": 3.816793893129771e-05, "loss": 6.273, "step": 15 }, { "epoch": 0.0, "grad_norm": 15.443071756758568, "learning_rate": 4.198473282442748e-05, "loss": 6.3071, "step": 16 }, { "epoch": 0.0, "grad_norm": 13.411145887257472, "learning_rate": 4.5801526717557256e-05, "loss": 6.3189, "step": 17 }, { "epoch": 0.0, "grad_norm": 18.541548606603897, "learning_rate": 4.9618320610687025e-05, "loss": 5.9221, "step": 18 }, { "epoch": 0.0, "grad_norm": 19.35655223746339, "learning_rate": 5.3435114503816794e-05, "loss": 6.0873, "step": 19 }, { "epoch": 0.0, "grad_norm": 9.670769552741916, "learning_rate": 5.725190839694656e-05, "loss": 5.9529, "step": 20 }, { "epoch": 0.0, "grad_norm": 15.209774689947576, "learning_rate": 6.106870229007633e-05, "loss": 5.9608, "step": 21 }, { "epoch": 0.0, "grad_norm": 27.79021469160418, "learning_rate": 6.488549618320611e-05, "loss": 5.8262, "step": 22 }, { "epoch": 0.0, "grad_norm": 8.600826473883481, "learning_rate": 6.870229007633588e-05, "loss": 5.8247, "step": 23 }, { "epoch": 0.0, "grad_norm": 18.54399604952589, "learning_rate": 7.251908396946565e-05, "loss": 5.5959, "step": 24 }, { "epoch": 0.0, "grad_norm": 16.426830529474223, "learning_rate": 7.633587786259542e-05, "loss": 5.9115, "step": 25 }, { "epoch": 0.0, "grad_norm": 15.633913299881293, "learning_rate": 8.015267175572518e-05, "loss": 5.8771, "step": 26 }, { "epoch": 0.0, "grad_norm": 11.410094465372426, "learning_rate": 8.396946564885496e-05, "loss": 5.7986, "step": 27 }, { "epoch": 0.0, "grad_norm": 6.817587936157467, "learning_rate": 8.778625954198472e-05, "loss": 5.6334, "step": 28 }, { "epoch": 0.0, "grad_norm": 8.510708964156741, "learning_rate": 9.160305343511451e-05, "loss": 5.8287, "step": 29 }, { "epoch": 0.0, 
"grad_norm": 10.158571202194247, "learning_rate": 9.541984732824429e-05, "loss": 5.7961, "step": 30 }, { "epoch": 0.0, "grad_norm": 7.842947814897398, "learning_rate": 9.923664122137405e-05, "loss": 5.306, "step": 31 }, { "epoch": 0.0, "grad_norm": 29.95386756616715, "learning_rate": 0.00010305343511450383, "loss": 5.3147, "step": 32 }, { "epoch": 0.0, "grad_norm": 29.95386756616715, "learning_rate": 0.00010305343511450383, "loss": 5.5563, "step": 33 }, { "epoch": 0.0, "grad_norm": 70.12390125349197, "learning_rate": 0.00010687022900763359, "loss": 5.7267, "step": 34 }, { "epoch": 0.0, "grad_norm": 17.157348893238563, "learning_rate": 0.00011068702290076336, "loss": 5.321, "step": 35 }, { "epoch": 0.0, "grad_norm": 241.29334778857753, "learning_rate": 0.00011450381679389313, "loss": 5.5991, "step": 36 }, { "epoch": 0.0, "grad_norm": 6.902079993753404, "learning_rate": 0.0001183206106870229, "loss": 5.4837, "step": 37 }, { "epoch": 0.0, "grad_norm": 17.796960161313766, "learning_rate": 0.00012213740458015266, "loss": 5.1122, "step": 38 }, { "epoch": 0.0, "grad_norm": 7.589528881724337, "learning_rate": 0.00012595419847328244, "loss": 5.2645, "step": 39 }, { "epoch": 0.0, "grad_norm": 5.959441201198923, "learning_rate": 0.00012977099236641222, "loss": 5.4282, "step": 40 }, { "epoch": 0.0, "grad_norm": 8.043404369119829, "learning_rate": 0.000133587786259542, "loss": 5.3091, "step": 41 }, { "epoch": 0.0, "grad_norm": 80.48575885998939, "learning_rate": 0.00013740458015267177, "loss": 5.4093, "step": 42 }, { "epoch": 0.0, "grad_norm": 13.742213731238829, "learning_rate": 0.00014122137404580154, "loss": 5.2129, "step": 43 }, { "epoch": 0.01, "grad_norm": 42.22169282692561, "learning_rate": 0.0001450381679389313, "loss": 5.211, "step": 44 }, { "epoch": 0.01, "grad_norm": 9.281606428765064, "learning_rate": 0.00014885496183206107, "loss": 5.4058, "step": 45 }, { "epoch": 0.01, "grad_norm": 41.010442640608986, "learning_rate": 0.00015267175572519084, "loss": 5.5207, "step": 46 }, { "epoch": 0.01, "grad_norm": 31.167750581302435, "learning_rate": 0.00015648854961832062, "loss": 5.4069, "step": 47 }, { "epoch": 0.01, "grad_norm": 21.867653948802253, "learning_rate": 0.00016030534351145037, "loss": 5.5624, "step": 48 }, { "epoch": 0.01, "grad_norm": 16.579039817183236, "learning_rate": 0.00016412213740458014, "loss": 5.2218, "step": 49 }, { "epoch": 0.01, "grad_norm": 41.86404616689693, "learning_rate": 0.00016793893129770992, "loss": 5.3396, "step": 50 }, { "epoch": 0.01, "grad_norm": 13.467500696816897, "learning_rate": 0.0001717557251908397, "loss": 5.5782, "step": 51 }, { "epoch": 0.01, "grad_norm": 16.39797654331649, "learning_rate": 0.00017557251908396944, "loss": 5.5483, "step": 52 }, { "epoch": 0.01, "grad_norm": 10.844720567867606, "learning_rate": 0.00017938931297709925, "loss": 5.2845, "step": 53 }, { "epoch": 0.01, "grad_norm": 10.7951422277577, "learning_rate": 0.00018320610687022902, "loss": 5.3984, "step": 54 }, { "epoch": 0.01, "grad_norm": 17.400732347691818, "learning_rate": 0.0001870229007633588, "loss": 5.21, "step": 55 }, { "epoch": 0.01, "grad_norm": 17.281499173536034, "learning_rate": 0.00019083969465648857, "loss": 5.266, "step": 56 }, { "epoch": 0.01, "grad_norm": 10.43134360259427, "learning_rate": 0.00019465648854961832, "loss": 5.2933, "step": 57 }, { "epoch": 0.01, "grad_norm": 96.44176322672297, "learning_rate": 0.0001984732824427481, "loss": 5.3701, "step": 58 }, { "epoch": 0.01, "grad_norm": 16.938450089540847, "learning_rate": 0.00020229007633587788, "loss": 4.9145, 
"step": 59 }, { "epoch": 0.01, "grad_norm": 14.942223211816597, "learning_rate": 0.00020610687022900765, "loss": 5.4219, "step": 60 }, { "epoch": 0.01, "grad_norm": 9.168541739617577, "learning_rate": 0.0002099236641221374, "loss": 5.4235, "step": 61 }, { "epoch": 0.01, "grad_norm": 11.610090652762562, "learning_rate": 0.00021374045801526718, "loss": 5.0843, "step": 62 }, { "epoch": 0.01, "grad_norm": 32.319478966201956, "learning_rate": 0.00021755725190839695, "loss": 4.9946, "step": 63 }, { "epoch": 0.01, "grad_norm": 16.94880320211913, "learning_rate": 0.00022137404580152673, "loss": 5.1152, "step": 64 }, { "epoch": 0.01, "grad_norm": 10.102931443308561, "learning_rate": 0.00022519083969465648, "loss": 4.7068, "step": 65 }, { "epoch": 0.01, "grad_norm": 7.74487021687703, "learning_rate": 0.00022900763358778625, "loss": 5.1637, "step": 66 }, { "epoch": 0.01, "grad_norm": 8.382587222584608, "learning_rate": 0.00023282442748091603, "loss": 4.9755, "step": 67 }, { "epoch": 0.01, "grad_norm": 10.996853398017768, "learning_rate": 0.0002366412213740458, "loss": 5.0931, "step": 68 }, { "epoch": 0.01, "grad_norm": 9.29270109874662, "learning_rate": 0.00024045801526717558, "loss": 5.091, "step": 69 }, { "epoch": 0.01, "grad_norm": 19.364669334853133, "learning_rate": 0.00024427480916030533, "loss": 4.8655, "step": 70 }, { "epoch": 0.01, "grad_norm": 33.720777329456745, "learning_rate": 0.00024809160305343513, "loss": 4.9781, "step": 71 }, { "epoch": 0.01, "grad_norm": 5.6159280095130395, "learning_rate": 0.0002519083969465649, "loss": 4.9571, "step": 72 }, { "epoch": 0.01, "grad_norm": 12.59599515871125, "learning_rate": 0.00025572519083969463, "loss": 4.6909, "step": 73 }, { "epoch": 0.01, "grad_norm": 11.093999472229571, "learning_rate": 0.00025954198473282443, "loss": 4.9774, "step": 74 }, { "epoch": 0.01, "grad_norm": 5.133620453764871, "learning_rate": 0.0002633587786259542, "loss": 4.8828, "step": 75 }, { "epoch": 0.01, "grad_norm": 6.196673371614883, "learning_rate": 0.000267175572519084, "loss": 5.059, "step": 76 }, { "epoch": 0.01, "grad_norm": 4.464088543678758, "learning_rate": 0.00027099236641221373, "loss": 5.3289, "step": 77 }, { "epoch": 0.01, "grad_norm": 6.088277605037016, "learning_rate": 0.00027480916030534353, "loss": 5.0887, "step": 78 }, { "epoch": 0.01, "grad_norm": 127.68908219066172, "learning_rate": 0.0002786259541984733, "loss": 4.9338, "step": 79 }, { "epoch": 0.01, "grad_norm": 6.786355635469429, "learning_rate": 0.0002824427480916031, "loss": 4.8178, "step": 80 }, { "epoch": 0.01, "grad_norm": 8.09008916094108, "learning_rate": 0.0002862595419847328, "loss": 4.989, "step": 81 }, { "epoch": 0.01, "grad_norm": 7.363692738916807, "learning_rate": 0.0002900763358778626, "loss": 4.916, "step": 82 }, { "epoch": 0.01, "grad_norm": 8.343738622170674, "learning_rate": 0.0002938931297709924, "loss": 4.8986, "step": 83 }, { "epoch": 0.01, "grad_norm": 3.841080176260726, "learning_rate": 0.00029770992366412214, "loss": 4.9701, "step": 84 }, { "epoch": 0.01, "grad_norm": 9.774889285110454, "learning_rate": 0.00030152671755725194, "loss": 4.9086, "step": 85 }, { "epoch": 0.01, "grad_norm": 12.687347604823433, "learning_rate": 0.0003053435114503817, "loss": 4.6547, "step": 86 }, { "epoch": 0.01, "grad_norm": 4.9833100474327745, "learning_rate": 0.0003091603053435115, "loss": 4.9316, "step": 87 }, { "epoch": 0.01, "grad_norm": 19.077058746375922, "learning_rate": 0.00031297709923664124, "loss": 4.7011, "step": 88 }, { "epoch": 0.01, "grad_norm": 12.346410737789308, "learning_rate": 
0.000316793893129771, "loss": 4.444, "step": 89 }, { "epoch": 0.01, "grad_norm": 4.054152865735593, "learning_rate": 0.00032061068702290074, "loss": 4.7164, "step": 90 }, { "epoch": 0.01, "grad_norm": 3.9069136259961748, "learning_rate": 0.00032442748091603054, "loss": 4.811, "step": 91 }, { "epoch": 0.01, "grad_norm": 4.581566131472375, "learning_rate": 0.0003282442748091603, "loss": 4.8878, "step": 92 }, { "epoch": 0.01, "grad_norm": 3.8349581527195733, "learning_rate": 0.0003320610687022901, "loss": 4.7922, "step": 93 }, { "epoch": 0.01, "grad_norm": 33.207733069046185, "learning_rate": 0.00033587786259541984, "loss": 4.7337, "step": 94 }, { "epoch": 0.01, "grad_norm": 6.647848196356661, "learning_rate": 0.00033969465648854964, "loss": 4.8087, "step": 95 }, { "epoch": 0.01, "grad_norm": 7.626992907131409, "learning_rate": 0.0003435114503816794, "loss": 4.823, "step": 96 }, { "epoch": 0.01, "grad_norm": 46.1397480756138, "learning_rate": 0.0003473282442748092, "loss": 4.6861, "step": 97 }, { "epoch": 0.01, "grad_norm": 11.202005600698312, "learning_rate": 0.0003511450381679389, "loss": 4.6784, "step": 98 }, { "epoch": 0.01, "grad_norm": 3.4382925983128563, "learning_rate": 0.0003549618320610687, "loss": 4.8087, "step": 99 }, { "epoch": 0.01, "grad_norm": 4.077030999013568, "learning_rate": 0.0003587786259541985, "loss": 4.5712, "step": 100 }, { "epoch": 0.01, "grad_norm": 5.733373967161133, "learning_rate": 0.00036259541984732824, "loss": 4.683, "step": 101 }, { "epoch": 0.01, "grad_norm": 4.4809021147253505, "learning_rate": 0.00036641221374045805, "loss": 4.7095, "step": 102 }, { "epoch": 0.01, "grad_norm": 8.049152649990367, "learning_rate": 0.0003702290076335878, "loss": 4.6054, "step": 103 }, { "epoch": 0.01, "grad_norm": 3.8438028188338227, "learning_rate": 0.0003740458015267176, "loss": 4.4923, "step": 104 }, { "epoch": 0.01, "grad_norm": 24.316393810042157, "learning_rate": 0.00037786259541984735, "loss": 4.7532, "step": 105 }, { "epoch": 0.01, "grad_norm": 3.51575232802249, "learning_rate": 0.00038167938931297715, "loss": 4.6174, "step": 106 }, { "epoch": 0.01, "grad_norm": 7.274919523295039, "learning_rate": 0.00038549618320610684, "loss": 4.789, "step": 107 }, { "epoch": 0.01, "grad_norm": 17.663585155755065, "learning_rate": 0.00038931297709923665, "loss": 4.5366, "step": 108 }, { "epoch": 0.01, "grad_norm": 55.463317792785325, "learning_rate": 0.0003931297709923664, "loss": 4.7898, "step": 109 }, { "epoch": 0.01, "grad_norm": 69.62582697257729, "learning_rate": 0.0003969465648854962, "loss": 4.8579, "step": 110 }, { "epoch": 0.01, "grad_norm": 14.068727648347044, "learning_rate": 0.00040076335877862595, "loss": 4.596, "step": 111 }, { "epoch": 0.01, "grad_norm": 7.723458596607843, "learning_rate": 0.00040458015267175575, "loss": 4.633, "step": 112 }, { "epoch": 0.01, "grad_norm": 40.36848101260133, "learning_rate": 0.0004083969465648855, "loss": 4.7708, "step": 113 }, { "epoch": 0.01, "grad_norm": 27.882246192846715, "learning_rate": 0.0004122137404580153, "loss": 4.8659, "step": 114 }, { "epoch": 0.01, "grad_norm": 31.58337302466099, "learning_rate": 0.00041603053435114505, "loss": 4.7711, "step": 115 }, { "epoch": 0.01, "grad_norm": 11.684862321791522, "learning_rate": 0.0004198473282442748, "loss": 4.7042, "step": 116 }, { "epoch": 0.01, "grad_norm": 14.833220649982183, "learning_rate": 0.00042366412213740455, "loss": 4.9598, "step": 117 }, { "epoch": 0.01, "grad_norm": 12.402942550832153, "learning_rate": 0.00042748091603053435, "loss": 4.774, "step": 118 }, { "epoch": 
0.01, "grad_norm": 7.781201337976804, "learning_rate": 0.00043129770992366415, "loss": 4.5104, "step": 119 }, { "epoch": 0.01, "grad_norm": 11.146758634349396, "learning_rate": 0.0004351145038167939, "loss": 4.5192, "step": 120 }, { "epoch": 0.01, "grad_norm": 12.677641788065019, "learning_rate": 0.0004389312977099237, "loss": 5.008, "step": 121 }, { "epoch": 0.01, "grad_norm": 5.854393144496698, "learning_rate": 0.00044274809160305345, "loss": 4.647, "step": 122 }, { "epoch": 0.01, "grad_norm": 8.854211722222779, "learning_rate": 0.00044656488549618326, "loss": 4.4995, "step": 123 }, { "epoch": 0.01, "grad_norm": 12.62479724368964, "learning_rate": 0.00045038167938931295, "loss": 4.7229, "step": 124 }, { "epoch": 0.01, "grad_norm": 5.62799536282657, "learning_rate": 0.00045419847328244275, "loss": 4.768, "step": 125 }, { "epoch": 0.01, "grad_norm": 5.51587430514037, "learning_rate": 0.0004580152671755725, "loss": 4.7339, "step": 126 }, { "epoch": 0.01, "grad_norm": 12.688383416225603, "learning_rate": 0.0004618320610687023, "loss": 4.6273, "step": 127 }, { "epoch": 0.01, "grad_norm": 7.134508473885959, "learning_rate": 0.00046564885496183206, "loss": 4.5568, "step": 128 }, { "epoch": 0.01, "grad_norm": 8.877737399803616, "learning_rate": 0.00046946564885496186, "loss": 4.85, "step": 129 }, { "epoch": 0.01, "grad_norm": 6.587326088108083, "learning_rate": 0.0004732824427480916, "loss": 4.6828, "step": 130 }, { "epoch": 0.02, "grad_norm": 5.799646336850364, "learning_rate": 0.0004770992366412214, "loss": 4.6904, "step": 131 }, { "epoch": 0.02, "grad_norm": 16.289388145774485, "learning_rate": 0.00048091603053435116, "loss": 4.774, "step": 132 }, { "epoch": 0.02, "grad_norm": 4.062719966100514, "learning_rate": 0.0004847328244274809, "loss": 4.5432, "step": 133 }, { "epoch": 0.02, "grad_norm": 9.671114739870742, "learning_rate": 0.0004885496183206107, "loss": 4.4836, "step": 134 }, { "epoch": 0.02, "grad_norm": 7.571931373368516, "learning_rate": 0.0004923664122137404, "loss": 4.8137, "step": 135 }, { "epoch": 0.02, "grad_norm": 5.689945287746852, "learning_rate": 0.0004961832061068703, "loss": 4.7838, "step": 136 }, { "epoch": 0.02, "grad_norm": 4.204208017185456, "learning_rate": 0.0005, "loss": 4.4835, "step": 137 }, { "epoch": 0.02, "grad_norm": 10.96590601380642, "learning_rate": 0.0005038167938931298, "loss": 4.8823, "step": 138 }, { "epoch": 0.02, "grad_norm": 6.339424602527296, "learning_rate": 0.0005076335877862596, "loss": 4.4707, "step": 139 }, { "epoch": 0.02, "grad_norm": 5.580852728442506, "learning_rate": 0.0005114503816793893, "loss": 4.6375, "step": 140 }, { "epoch": 0.02, "grad_norm": 6.421095753332356, "learning_rate": 0.0005152671755725191, "loss": 4.3955, "step": 141 }, { "epoch": 0.02, "grad_norm": 8.714303537346883, "learning_rate": 0.0005190839694656489, "loss": 4.5009, "step": 142 }, { "epoch": 0.02, "grad_norm": 5.339088565335364, "learning_rate": 0.0005229007633587787, "loss": 4.5163, "step": 143 }, { "epoch": 0.02, "grad_norm": 10.049133898730092, "learning_rate": 0.0005267175572519084, "loss": 4.4541, "step": 144 }, { "epoch": 0.02, "grad_norm": 6.769322799336011, "learning_rate": 0.0005305343511450382, "loss": 4.5766, "step": 145 }, { "epoch": 0.02, "grad_norm": 4.515303661964638, "learning_rate": 0.000534351145038168, "loss": 4.6578, "step": 146 }, { "epoch": 0.02, "grad_norm": 14.095522378611038, "learning_rate": 0.0005381679389312977, "loss": 4.4995, "step": 147 }, { "epoch": 0.02, "grad_norm": 4.203145382303139, "learning_rate": 0.0005419847328244275, 
"loss": 4.6626, "step": 148 }, { "epoch": 0.02, "grad_norm": 8.066785408747641, "learning_rate": 0.0005458015267175572, "loss": 4.6363, "step": 149 }, { "epoch": 0.02, "grad_norm": 9.828904178357, "learning_rate": 0.0005496183206106871, "loss": 4.6747, "step": 150 }, { "epoch": 0.02, "grad_norm": 3.054577199866886, "learning_rate": 0.0005534351145038168, "loss": 4.2082, "step": 151 }, { "epoch": 0.02, "grad_norm": 5.972821403717792, "learning_rate": 0.0005572519083969466, "loss": 4.5258, "step": 152 }, { "epoch": 0.02, "grad_norm": 9.834062698289323, "learning_rate": 0.0005610687022900763, "loss": 4.5866, "step": 153 }, { "epoch": 0.02, "grad_norm": 4.878115038212248, "learning_rate": 0.0005648854961832062, "loss": 4.3865, "step": 154 }, { "epoch": 0.02, "grad_norm": 7.906033622993491, "learning_rate": 0.0005687022900763359, "loss": 4.6188, "step": 155 }, { "epoch": 0.02, "grad_norm": 11.350443978166005, "learning_rate": 0.0005725190839694656, "loss": 4.6495, "step": 156 }, { "epoch": 0.02, "grad_norm": 3.725292172263509, "learning_rate": 0.0005763358778625954, "loss": 4.4811, "step": 157 }, { "epoch": 0.02, "grad_norm": 4.881180693187068, "learning_rate": 0.0005801526717557252, "loss": 4.6711, "step": 158 }, { "epoch": 0.02, "grad_norm": 4.962413809276911, "learning_rate": 0.000583969465648855, "loss": 4.4648, "step": 159 }, { "epoch": 0.02, "grad_norm": 10.235220371326772, "learning_rate": 0.0005877862595419848, "loss": 4.5727, "step": 160 }, { "epoch": 0.02, "grad_norm": 8.820349424210823, "learning_rate": 0.0005916030534351145, "loss": 4.4386, "step": 161 }, { "epoch": 0.02, "grad_norm": 2.3861226289417425, "learning_rate": 0.0005954198473282443, "loss": 4.3023, "step": 162 }, { "epoch": 0.02, "grad_norm": 2.829996276076901, "learning_rate": 0.0005992366412213741, "loss": 4.4348, "step": 163 }, { "epoch": 0.02, "grad_norm": 4.567866999086078, "learning_rate": 0.0006030534351145039, "loss": 4.6014, "step": 164 }, { "epoch": 0.02, "grad_norm": 4.109882129633001, "learning_rate": 0.0006068702290076335, "loss": 4.378, "step": 165 }, { "epoch": 0.02, "grad_norm": 3.0471820285953224, "learning_rate": 0.0006106870229007634, "loss": 4.4039, "step": 166 }, { "epoch": 0.02, "grad_norm": 2.8997271215545357, "learning_rate": 0.0006145038167938931, "loss": 4.4568, "step": 167 }, { "epoch": 0.02, "grad_norm": 5.003835180533562, "learning_rate": 0.000618320610687023, "loss": 4.5593, "step": 168 }, { "epoch": 0.02, "grad_norm": 4.4249898499210225, "learning_rate": 0.0006221374045801526, "loss": 4.5102, "step": 169 }, { "epoch": 0.02, "grad_norm": 3.42667634735796, "learning_rate": 0.0006259541984732825, "loss": 4.4448, "step": 170 }, { "epoch": 0.02, "grad_norm": 3.1805160049917998, "learning_rate": 0.0006297709923664122, "loss": 4.5395, "step": 171 }, { "epoch": 0.02, "grad_norm": 4.839914129485522, "learning_rate": 0.000633587786259542, "loss": 4.5604, "step": 172 }, { "epoch": 0.02, "grad_norm": 3.408883824364028, "learning_rate": 0.0006374045801526717, "loss": 4.5022, "step": 173 }, { "epoch": 0.02, "grad_norm": 2.299551349387421, "learning_rate": 0.0006412213740458015, "loss": 4.2675, "step": 174 }, { "epoch": 0.02, "grad_norm": 2.827826989519601, "learning_rate": 0.0006450381679389313, "loss": 4.6388, "step": 175 }, { "epoch": 0.02, "grad_norm": 4.838274469042938, "learning_rate": 0.0006488549618320611, "loss": 4.2013, "step": 176 }, { "epoch": 0.02, "grad_norm": 4.701545778360774, "learning_rate": 0.0006526717557251909, "loss": 4.198, "step": 177 }, { "epoch": 0.02, "grad_norm": 
3.7396269470245014, "learning_rate": 0.0006564885496183206, "loss": 4.5289, "step": 178 }, { "epoch": 0.02, "grad_norm": 3.4423815155048128, "learning_rate": 0.0006603053435114504, "loss": 4.5216, "step": 179 }, { "epoch": 0.02, "grad_norm": 2.9197059073626574, "learning_rate": 0.0006641221374045802, "loss": 4.3428, "step": 180 }, { "epoch": 0.02, "grad_norm": 2.769911744605693, "learning_rate": 0.0006679389312977099, "loss": 4.2843, "step": 181 }, { "epoch": 0.02, "grad_norm": 2.690682849058132, "learning_rate": 0.0006717557251908397, "loss": 4.4419, "step": 182 }, { "epoch": 0.02, "grad_norm": 1.9424212986632443, "learning_rate": 0.0006755725190839694, "loss": 4.3486, "step": 183 }, { "epoch": 0.02, "grad_norm": 4.23739191988172, "learning_rate": 0.0006793893129770993, "loss": 4.7393, "step": 184 }, { "epoch": 0.02, "grad_norm": 3.1332515937914214, "learning_rate": 0.000683206106870229, "loss": 4.1623, "step": 185 }, { "epoch": 0.02, "grad_norm": 6.231515178578274, "learning_rate": 0.0006870229007633588, "loss": 4.2267, "step": 186 }, { "epoch": 0.02, "grad_norm": 2.6594621184737854, "learning_rate": 0.0006908396946564885, "loss": 4.6288, "step": 187 }, { "epoch": 0.02, "grad_norm": 4.172904422312939, "learning_rate": 0.0006946564885496184, "loss": 4.5263, "step": 188 }, { "epoch": 0.02, "grad_norm": 3.7173716873826677, "learning_rate": 0.0006984732824427481, "loss": 4.4568, "step": 189 }, { "epoch": 0.02, "grad_norm": 16.872230900932873, "learning_rate": 0.0007022900763358778, "loss": 4.424, "step": 190 }, { "epoch": 0.02, "grad_norm": 4.350962747873534, "learning_rate": 0.0007061068702290076, "loss": 4.367, "step": 191 }, { "epoch": 0.02, "grad_norm": 8.601264483119966, "learning_rate": 0.0007099236641221374, "loss": 4.3571, "step": 192 }, { "epoch": 0.02, "grad_norm": 2.1843421804869734, "learning_rate": 0.0007137404580152672, "loss": 4.2473, "step": 193 }, { "epoch": 0.02, "grad_norm": 2.968397695386696, "learning_rate": 0.000717557251908397, "loss": 4.606, "step": 194 }, { "epoch": 0.02, "grad_norm": 2.986718755959671, "learning_rate": 0.0007213740458015267, "loss": 4.6034, "step": 195 }, { "epoch": 0.02, "grad_norm": 2.9287337532257833, "learning_rate": 0.0007251908396946565, "loss": 4.602, "step": 196 }, { "epoch": 0.02, "grad_norm": 4.773459635371496, "learning_rate": 0.0007290076335877863, "loss": 4.2743, "step": 197 }, { "epoch": 0.02, "grad_norm": 2.9164644288219783, "learning_rate": 0.0007328244274809161, "loss": 4.4361, "step": 198 }, { "epoch": 0.02, "grad_norm": 5.779159541538482, "learning_rate": 0.0007366412213740457, "loss": 4.5472, "step": 199 }, { "epoch": 0.02, "grad_norm": 6.0916349893994335, "learning_rate": 0.0007404580152671756, "loss": 4.5428, "step": 200 }, { "epoch": 0.02, "grad_norm": 2.389716197130383, "learning_rate": 0.0007442748091603053, "loss": 4.3915, "step": 201 }, { "epoch": 0.02, "grad_norm": 17.78368287135694, "learning_rate": 0.0007480916030534352, "loss": 4.4451, "step": 202 }, { "epoch": 0.02, "grad_norm": 13.327524369133535, "learning_rate": 0.0007519083969465648, "loss": 4.4727, "step": 203 }, { "epoch": 0.02, "grad_norm": 5.099656932685892, "learning_rate": 0.0007557251908396947, "loss": 4.3978, "step": 204 }, { "epoch": 0.02, "grad_norm": 3.6996822665342752, "learning_rate": 0.0007595419847328244, "loss": 4.6511, "step": 205 }, { "epoch": 0.02, "grad_norm": 8.296610970871065, "learning_rate": 0.0007633587786259543, "loss": 4.6469, "step": 206 }, { "epoch": 0.02, "grad_norm": 3.4208667772212697, "learning_rate": 0.0007671755725190839, "loss": 
4.6062, "step": 207 }, { "epoch": 0.02, "grad_norm": 3.4201300804499892, "learning_rate": 0.0007709923664122137, "loss": 4.4263, "step": 208 }, { "epoch": 0.02, "grad_norm": 1.9861289727375815, "learning_rate": 0.0007748091603053435, "loss": 4.516, "step": 209 }, { "epoch": 0.02, "grad_norm": 2.88462750299719, "learning_rate": 0.0007786259541984733, "loss": 4.4868, "step": 210 }, { "epoch": 0.02, "grad_norm": 2.5817566645114822, "learning_rate": 0.000782442748091603, "loss": 4.582, "step": 211 }, { "epoch": 0.02, "grad_norm": 3.228321968615345, "learning_rate": 0.0007862595419847328, "loss": 4.6532, "step": 212 }, { "epoch": 0.02, "grad_norm": 2.167668815466419, "learning_rate": 0.0007900763358778626, "loss": 4.4709, "step": 213 }, { "epoch": 0.02, "grad_norm": 4.580577107941309, "learning_rate": 0.0007938931297709924, "loss": 4.5146, "step": 214 }, { "epoch": 0.02, "grad_norm": 3.5443728989974885, "learning_rate": 0.0007977099236641223, "loss": 4.553, "step": 215 }, { "epoch": 0.02, "grad_norm": 2.0299932135637833, "learning_rate": 0.0008015267175572519, "loss": 4.185, "step": 216 }, { "epoch": 0.02, "grad_norm": 2.8272423773022726, "learning_rate": 0.0008053435114503816, "loss": 4.5649, "step": 217 }, { "epoch": 0.02, "grad_norm": 2.416432185490534, "learning_rate": 0.0008091603053435115, "loss": 4.3797, "step": 218 }, { "epoch": 0.03, "grad_norm": 2.2976442785472453, "learning_rate": 0.0008129770992366412, "loss": 4.346, "step": 219 }, { "epoch": 0.03, "grad_norm": 2.77829602737746, "learning_rate": 0.000816793893129771, "loss": 4.5063, "step": 220 }, { "epoch": 0.03, "grad_norm": 2.1986970498571137, "learning_rate": 0.0008206106870229007, "loss": 4.3934, "step": 221 }, { "epoch": 0.03, "grad_norm": 2.6401076937384564, "learning_rate": 0.0008244274809160306, "loss": 4.6939, "step": 222 }, { "epoch": 0.03, "grad_norm": 2.5636907151683013, "learning_rate": 0.0008282442748091604, "loss": 4.3629, "step": 223 }, { "epoch": 0.03, "grad_norm": 3.542478498117355, "learning_rate": 0.0008320610687022901, "loss": 4.6202, "step": 224 }, { "epoch": 0.03, "grad_norm": 4.746233613488993, "learning_rate": 0.0008358778625954198, "loss": 4.4122, "step": 225 }, { "epoch": 0.03, "grad_norm": 1.8708251452310167, "learning_rate": 0.0008396946564885496, "loss": 4.4217, "step": 226 }, { "epoch": 0.03, "grad_norm": 2.444968113480139, "learning_rate": 0.0008435114503816795, "loss": 4.5393, "step": 227 }, { "epoch": 0.03, "grad_norm": 3.2450594759081235, "learning_rate": 0.0008473282442748091, "loss": 4.4331, "step": 228 }, { "epoch": 0.03, "grad_norm": 3.9391447146402, "learning_rate": 0.000851145038167939, "loss": 4.4888, "step": 229 }, { "epoch": 0.03, "grad_norm": 2.4170658168574826, "learning_rate": 0.0008549618320610687, "loss": 4.5963, "step": 230 }, { "epoch": 0.03, "grad_norm": 6.412037612420958, "learning_rate": 0.0008587786259541986, "loss": 4.3901, "step": 231 }, { "epoch": 0.03, "grad_norm": 2.3216306037870966, "learning_rate": 0.0008625954198473283, "loss": 4.6436, "step": 232 }, { "epoch": 0.03, "grad_norm": 2.204695738511768, "learning_rate": 0.0008664122137404581, "loss": 4.3861, "step": 233 }, { "epoch": 0.03, "grad_norm": 2.5881192638075388, "learning_rate": 0.0008702290076335878, "loss": 4.3157, "step": 234 }, { "epoch": 0.03, "grad_norm": 2.707627414135801, "learning_rate": 0.0008740458015267176, "loss": 4.4016, "step": 235 }, { "epoch": 0.03, "grad_norm": 5.7561786608264915, "learning_rate": 0.0008778625954198474, "loss": 4.6088, "step": 236 }, { "epoch": 0.03, "grad_norm": 
2.197011465188467, "learning_rate": 0.000881679389312977, "loss": 4.2826, "step": 237 }, { "epoch": 0.03, "grad_norm": 2.2856055746024158, "learning_rate": 0.0008854961832061069, "loss": 4.4652, "step": 238 }, { "epoch": 0.03, "grad_norm": 2.5218063127606363, "learning_rate": 0.0008893129770992367, "loss": 4.3299, "step": 239 }, { "epoch": 0.03, "grad_norm": 3.293537376053151, "learning_rate": 0.0008931297709923665, "loss": 4.3963, "step": 240 }, { "epoch": 0.03, "grad_norm": 4.613118043041774, "learning_rate": 0.0008969465648854962, "loss": 4.1824, "step": 241 }, { "epoch": 0.03, "grad_norm": 3.1274081823656537, "learning_rate": 0.0009007633587786259, "loss": 4.3763, "step": 242 }, { "epoch": 0.03, "grad_norm": 2.1913908195045604, "learning_rate": 0.0009045801526717558, "loss": 4.2244, "step": 243 }, { "epoch": 0.03, "grad_norm": 8.735246290525192, "learning_rate": 0.0009083969465648855, "loss": 4.3672, "step": 244 }, { "epoch": 0.03, "grad_norm": 1.7796772742031433, "learning_rate": 0.0009122137404580153, "loss": 4.3862, "step": 245 }, { "epoch": 0.03, "grad_norm": 3.0758222103486177, "learning_rate": 0.000916030534351145, "loss": 4.4179, "step": 246 }, { "epoch": 0.03, "grad_norm": 2.0536182655186774, "learning_rate": 0.0009198473282442749, "loss": 4.3517, "step": 247 }, { "epoch": 0.03, "grad_norm": 3.2758282364430014, "learning_rate": 0.0009236641221374046, "loss": 4.3807, "step": 248 }, { "epoch": 0.03, "grad_norm": 1.843132045826723, "learning_rate": 0.0009274809160305345, "loss": 4.4049, "step": 249 }, { "epoch": 0.03, "grad_norm": 1.910805853533411, "learning_rate": 0.0009312977099236641, "loss": 4.3877, "step": 250 }, { "epoch": 0.03, "grad_norm": 3.19510209058544, "learning_rate": 0.0009351145038167939, "loss": 4.4845, "step": 251 }, { "epoch": 0.03, "grad_norm": 4.0749342020877695, "learning_rate": 0.0009389312977099237, "loss": 4.195, "step": 252 }, { "epoch": 0.03, "grad_norm": 2.216217525993704, "learning_rate": 0.0009427480916030535, "loss": 4.4828, "step": 253 }, { "epoch": 0.03, "grad_norm": 2.1383094479653644, "learning_rate": 0.0009465648854961832, "loss": 4.4005, "step": 254 }, { "epoch": 0.03, "grad_norm": 2.1101597930385565, "learning_rate": 0.000950381679389313, "loss": 4.5747, "step": 255 }, { "epoch": 0.03, "grad_norm": 2.1915254968495264, "learning_rate": 0.0009541984732824428, "loss": 4.6898, "step": 256 }, { "epoch": 0.03, "grad_norm": 2.4044266305415625, "learning_rate": 0.0009580152671755726, "loss": 4.1006, "step": 257 }, { "epoch": 0.03, "grad_norm": 1.7836059353283142, "learning_rate": 0.0009618320610687023, "loss": 4.4565, "step": 258 }, { "epoch": 0.03, "grad_norm": 2.7771059560387372, "learning_rate": 0.0009656488549618321, "loss": 4.3205, "step": 259 }, { "epoch": 0.03, "grad_norm": 2.5617094847407436, "learning_rate": 0.0009694656488549618, "loss": 4.1082, "step": 260 }, { "epoch": 0.03, "grad_norm": 2.151624867377891, "learning_rate": 0.0009732824427480917, "loss": 4.5197, "step": 261 }, { "epoch": 0.03, "grad_norm": 2.315863073425737, "learning_rate": 0.0009770992366412213, "loss": 4.3367, "step": 262 }, { "epoch": 0.03, "grad_norm": 1.6839986367475495, "learning_rate": 0.0009809160305343512, "loss": 4.4433, "step": 263 }, { "epoch": 0.03, "grad_norm": 2.8906104935615082, "learning_rate": 0.0009847328244274808, "loss": 4.51, "step": 264 }, { "epoch": 0.03, "grad_norm": 2.437146080210409, "learning_rate": 0.0009885496183206107, "loss": 4.4271, "step": 265 }, { "epoch": 0.03, "grad_norm": 2.05272445089857, "learning_rate": 0.0009923664122137405, 
"loss": 4.3973, "step": 266 }, { "epoch": 0.03, "grad_norm": 5.993286089251946, "learning_rate": 0.0009961832061068704, "loss": 4.1809, "step": 267 }, { "epoch": 0.03, "grad_norm": 2.0731305133812183, "learning_rate": 0.001, "loss": 4.4188, "step": 268 }, { "epoch": 0.03, "grad_norm": 2.0131362376117545, "learning_rate": 0.0009999999655172654, "loss": 4.1585, "step": 269 }, { "epoch": 0.03, "grad_norm": 2.83819392543949, "learning_rate": 0.0009999998620690664, "loss": 4.4099, "step": 270 }, { "epoch": 0.03, "grad_norm": 8.672640493422819, "learning_rate": 0.0009999996896554175, "loss": 4.1685, "step": 271 }, { "epoch": 0.03, "grad_norm": 3.789947491857123, "learning_rate": 0.0009999994482763422, "loss": 4.2925, "step": 272 }, { "epoch": 0.03, "grad_norm": 3.5579574922920996, "learning_rate": 0.0009999991379318737, "loss": 4.3231, "step": 273 }, { "epoch": 0.03, "grad_norm": 3.115711269762882, "learning_rate": 0.000999998758622055, "loss": 4.386, "step": 274 }, { "epoch": 0.03, "grad_norm": 4.346000979175537, "learning_rate": 0.0009999983103469385, "loss": 4.4603, "step": 275 }, { "epoch": 0.03, "grad_norm": 3.0115417453550166, "learning_rate": 0.0009999977931065857, "loss": 4.3594, "step": 276 }, { "epoch": 0.03, "grad_norm": 2.4249457923798756, "learning_rate": 0.0009999972069010686, "loss": 4.1893, "step": 277 }, { "epoch": 0.03, "grad_norm": 4.819293487728759, "learning_rate": 0.0009999965517304673, "loss": 4.3919, "step": 278 }, { "epoch": 0.03, "grad_norm": 2.1107671692877013, "learning_rate": 0.0009999958275948725, "loss": 4.2951, "step": 279 }, { "epoch": 0.03, "grad_norm": 8.73677749460373, "learning_rate": 0.0009999950344943842, "loss": 4.4385, "step": 280 }, { "epoch": 0.03, "grad_norm": 4.05374890058115, "learning_rate": 0.0009999941724291115, "loss": 4.3969, "step": 281 }, { "epoch": 0.03, "grad_norm": 5.421257940385821, "learning_rate": 0.0009999932413991737, "loss": 4.34, "step": 282 }, { "epoch": 0.03, "grad_norm": 6.828704389126077, "learning_rate": 0.0009999922414046986, "loss": 4.5356, "step": 283 }, { "epoch": 0.03, "grad_norm": 2.0673160031126026, "learning_rate": 0.0009999911724458248, "loss": 4.4704, "step": 284 }, { "epoch": 0.03, "grad_norm": 2.3057595607561203, "learning_rate": 0.0009999900345226994, "loss": 4.4737, "step": 285 }, { "epoch": 0.03, "grad_norm": 1.9606063690862783, "learning_rate": 0.0009999888276354795, "loss": 4.3361, "step": 286 }, { "epoch": 0.03, "grad_norm": 4.528379990583092, "learning_rate": 0.0009999875517843315, "loss": 4.2096, "step": 287 }, { "epoch": 0.03, "grad_norm": 2.2204819397446744, "learning_rate": 0.0009999862069694312, "loss": 4.2857, "step": 288 }, { "epoch": 0.03, "grad_norm": 2.625037551590264, "learning_rate": 0.0009999847931909645, "loss": 4.4387, "step": 289 }, { "epoch": 0.03, "grad_norm": 1.831171221857622, "learning_rate": 0.000999983310449126, "loss": 4.1972, "step": 290 }, { "epoch": 0.03, "grad_norm": 3.6362060252404214, "learning_rate": 0.0009999817587441203, "loss": 4.2273, "step": 291 }, { "epoch": 0.03, "grad_norm": 2.2726150008550103, "learning_rate": 0.0009999801380761615, "loss": 4.2443, "step": 292 }, { "epoch": 0.03, "grad_norm": 4.074624444798479, "learning_rate": 0.0009999784484454734, "loss": 4.6289, "step": 293 }, { "epoch": 0.03, "grad_norm": 10.227047786644134, "learning_rate": 0.0009999766898522884, "loss": 4.3537, "step": 294 }, { "epoch": 0.03, "grad_norm": 4.2390564894381315, "learning_rate": 0.0009999748622968496, "loss": 4.2276, "step": 295 }, { "epoch": 0.03, "grad_norm": 2.4379596999010844, 
"learning_rate": 0.000999972965779409, "loss": 4.2626, "step": 296 }, { "epoch": 0.03, "grad_norm": 2.373799673214772, "learning_rate": 0.000999971000300228, "loss": 4.3846, "step": 297 }, { "epoch": 0.03, "grad_norm": 2.856821315629644, "learning_rate": 0.000999968965859578, "loss": 4.2642, "step": 298 }, { "epoch": 0.03, "grad_norm": 2.3722167840426427, "learning_rate": 0.0009999668624577395, "loss": 4.5474, "step": 299 }, { "epoch": 0.03, "grad_norm": 4.461104365722385, "learning_rate": 0.0009999646900950023, "loss": 4.6385, "step": 300 }, { "epoch": 0.03, "grad_norm": 1.869855563420753, "learning_rate": 0.0009999624487716666, "loss": 4.3475, "step": 301 }, { "epoch": 0.03, "grad_norm": 2.042584671628898, "learning_rate": 0.000999960138488041, "loss": 4.3183, "step": 302 }, { "epoch": 0.03, "grad_norm": 2.520559709960604, "learning_rate": 0.0009999577592444443, "loss": 4.2622, "step": 303 }, { "epoch": 0.03, "grad_norm": 2.62195137213332, "learning_rate": 0.000999955311041205, "loss": 4.2762, "step": 304 }, { "epoch": 0.03, "grad_norm": 1.9086962901730506, "learning_rate": 0.0009999527938786606, "loss": 4.5629, "step": 305 }, { "epoch": 0.04, "grad_norm": 1.8038421215919183, "learning_rate": 0.0009999502077571581, "loss": 4.0871, "step": 306 }, { "epoch": 0.04, "grad_norm": 1.7833722192645467, "learning_rate": 0.0009999475526770545, "loss": 4.4777, "step": 307 }, { "epoch": 0.04, "grad_norm": 1.8478429683980375, "learning_rate": 0.0009999448286387158, "loss": 4.2395, "step": 308 }, { "epoch": 0.04, "grad_norm": 1.7454565596847331, "learning_rate": 0.0009999420356425178, "loss": 4.5417, "step": 309 }, { "epoch": 0.04, "grad_norm": 1.9479769325562462, "learning_rate": 0.0009999391736888457, "loss": 4.26, "step": 310 }, { "epoch": 0.04, "grad_norm": 1.6470242227872336, "learning_rate": 0.0009999362427780942, "loss": 4.1583, "step": 311 }, { "epoch": 0.04, "grad_norm": 3.263975490281977, "learning_rate": 0.0009999332429106679, "loss": 4.1444, "step": 312 }, { "epoch": 0.04, "grad_norm": 1.7699800239020929, "learning_rate": 0.00099993017408698, "loss": 4.2735, "step": 313 }, { "epoch": 0.04, "grad_norm": 1.8257742883377097, "learning_rate": 0.0009999270363074547, "loss": 4.203, "step": 314 }, { "epoch": 0.04, "grad_norm": 10.691140861191744, "learning_rate": 0.0009999238295725237, "loss": 4.2063, "step": 315 }, { "epoch": 0.04, "grad_norm": 2.622993729167949, "learning_rate": 0.00099992055388263, "loss": 4.4137, "step": 316 }, { "epoch": 0.04, "grad_norm": 1.9065694661952346, "learning_rate": 0.0009999172092382252, "loss": 4.4522, "step": 317 }, { "epoch": 0.04, "grad_norm": 1.7455840755342873, "learning_rate": 0.0009999137956397707, "loss": 4.2275, "step": 318 }, { "epoch": 0.04, "grad_norm": 1.6173218058236567, "learning_rate": 0.0009999103130877373, "loss": 4.2826, "step": 319 }, { "epoch": 0.04, "grad_norm": 1.8245320160262717, "learning_rate": 0.0009999067615826054, "loss": 4.4055, "step": 320 }, { "epoch": 0.04, "grad_norm": 1.849797482883143, "learning_rate": 0.000999903141124865, "loss": 4.1618, "step": 321 }, { "epoch": 0.04, "grad_norm": 6.027741799392095, "learning_rate": 0.000999899451715015, "loss": 4.1887, "step": 322 }, { "epoch": 0.04, "grad_norm": 1.8938322381125947, "learning_rate": 0.0009998956933535649, "loss": 4.4724, "step": 323 }, { "epoch": 0.04, "grad_norm": 3.14582685414885, "learning_rate": 0.0009998918660410324, "loss": 4.2194, "step": 324 }, { "epoch": 0.04, "grad_norm": 1.876924450559067, "learning_rate": 0.000999887969777946, "loss": 4.4041, "step": 325 }, { 
"epoch": 0.04, "grad_norm": 2.390883188029618, "learning_rate": 0.000999884004564843, "loss": 4.3214, "step": 326 }, { "epoch": 0.04, "grad_norm": 5.297007972611703, "learning_rate": 0.00099987997040227, "loss": 4.147, "step": 327 }, { "epoch": 0.04, "grad_norm": 8.385437012240843, "learning_rate": 0.0009998758672907838, "loss": 4.2884, "step": 328 }, { "epoch": 0.04, "grad_norm": 2.414164727999012, "learning_rate": 0.0009998716952309501, "loss": 4.3311, "step": 329 }, { "epoch": 0.04, "grad_norm": 1.7864078039354472, "learning_rate": 0.0009998674542233445, "loss": 4.0562, "step": 330 }, { "epoch": 0.04, "grad_norm": 1.9217939924899319, "learning_rate": 0.000999863144268552, "loss": 4.3897, "step": 331 }, { "epoch": 0.04, "grad_norm": 2.4365590695993102, "learning_rate": 0.000999858765367167, "loss": 4.1876, "step": 332 }, { "epoch": 0.04, "grad_norm": 1.68524828347413, "learning_rate": 0.0009998543175197936, "loss": 4.2725, "step": 333 }, { "epoch": 0.04, "grad_norm": 2.1054903880856353, "learning_rate": 0.000999849800727045, "loss": 4.3586, "step": 334 }, { "epoch": 0.04, "grad_norm": 2.3470789148104543, "learning_rate": 0.0009998452149895445, "loss": 4.2405, "step": 335 }, { "epoch": 0.04, "grad_norm": 1.5167190857954032, "learning_rate": 0.0009998405603079243, "loss": 4.3523, "step": 336 }, { "epoch": 0.04, "grad_norm": 4.013670800366627, "learning_rate": 0.0009998358366828269, "loss": 4.3284, "step": 337 }, { "epoch": 0.04, "grad_norm": 3.137507221222173, "learning_rate": 0.0009998310441149034, "loss": 4.3924, "step": 338 }, { "epoch": 0.04, "grad_norm": 2.392495038801323, "learning_rate": 0.000999826182604815, "loss": 4.3711, "step": 339 }, { "epoch": 0.04, "grad_norm": 3.5545150008638227, "learning_rate": 0.0009998212521532325, "loss": 4.2667, "step": 340 }, { "epoch": 0.04, "grad_norm": 1.8697294274015155, "learning_rate": 0.0009998162527608354, "loss": 4.2159, "step": 341 }, { "epoch": 0.04, "grad_norm": 3.34750623048148, "learning_rate": 0.0009998111844283137, "loss": 4.4183, "step": 342 }, { "epoch": 0.04, "grad_norm": 1.9396733830579354, "learning_rate": 0.0009998060471563665, "loss": 4.4659, "step": 343 }, { "epoch": 0.04, "grad_norm": 1.8858733848032796, "learning_rate": 0.0009998008409457023, "loss": 4.3332, "step": 344 }, { "epoch": 0.04, "grad_norm": 8.828833026494243, "learning_rate": 0.000999795565797039, "loss": 4.0625, "step": 345 }, { "epoch": 0.04, "grad_norm": 1.709332950640674, "learning_rate": 0.0009997902217111045, "loss": 4.0983, "step": 346 }, { "epoch": 0.04, "grad_norm": 1.9208826877405047, "learning_rate": 0.0009997848086886357, "loss": 4.3697, "step": 347 }, { "epoch": 0.04, "grad_norm": 1.62645722896435, "learning_rate": 0.0009997793267303792, "loss": 4.2086, "step": 348 }, { "epoch": 0.04, "grad_norm": 1.9815729985870292, "learning_rate": 0.0009997737758370914, "loss": 4.3787, "step": 349 }, { "epoch": 0.04, "grad_norm": 1.780445603437587, "learning_rate": 0.0009997681560095378, "loss": 4.2932, "step": 350 }, { "epoch": 0.04, "grad_norm": 4.176947035502542, "learning_rate": 0.0009997624672484933, "loss": 4.2574, "step": 351 }, { "epoch": 0.04, "grad_norm": 1.7735079933837956, "learning_rate": 0.0009997567095547432, "loss": 4.3329, "step": 352 }, { "epoch": 0.04, "grad_norm": 2.0269833428455115, "learning_rate": 0.000999750882929081, "loss": 4.3568, "step": 353 }, { "epoch": 0.04, "grad_norm": 1.9251298739654332, "learning_rate": 0.0009997449873723105, "loss": 4.2947, "step": 354 }, { "epoch": 0.04, "grad_norm": 2.200350577671691, "learning_rate": 
0.000999739022885245, "loss": 4.3672, "step": 355 }, { "epoch": 0.04, "grad_norm": 2.149172821403982, "learning_rate": 0.0009997329894687072, "loss": 4.2773, "step": 356 }, { "epoch": 0.04, "grad_norm": 1.763960309192054, "learning_rate": 0.0009997268871235296, "loss": 4.2949, "step": 357 }, { "epoch": 0.04, "grad_norm": 1.600426239130483, "learning_rate": 0.0009997207158505533, "loss": 4.3441, "step": 358 }, { "epoch": 0.04, "grad_norm": 2.9551041379575542, "learning_rate": 0.0009997144756506298, "loss": 4.3803, "step": 359 }, { "epoch": 0.04, "grad_norm": 1.531793568367533, "learning_rate": 0.00099970816652462, "loss": 4.2699, "step": 360 }, { "epoch": 0.04, "grad_norm": 1.7973840385526973, "learning_rate": 0.0009997017884733938, "loss": 4.2753, "step": 361 }, { "epoch": 0.04, "grad_norm": 2.7995611818018387, "learning_rate": 0.000999695341497831, "loss": 4.1094, "step": 362 }, { "epoch": 0.04, "grad_norm": 1.7991061456979116, "learning_rate": 0.0009996888255988207, "loss": 4.3081, "step": 363 }, { "epoch": 0.04, "grad_norm": 2.351852600194795, "learning_rate": 0.0009996822407772623, "loss": 4.249, "step": 364 }, { "epoch": 0.04, "grad_norm": 1.8309715923194394, "learning_rate": 0.0009996755870340633, "loss": 4.2085, "step": 365 }, { "epoch": 0.04, "grad_norm": 11.363773271735596, "learning_rate": 0.0009996688643701419, "loss": 4.4026, "step": 366 }, { "epoch": 0.04, "grad_norm": 4.296610447883313, "learning_rate": 0.0009996620727864252, "loss": 4.3306, "step": 367 }, { "epoch": 0.04, "grad_norm": 1.5541660637803594, "learning_rate": 0.00099965521228385, "loss": 4.0953, "step": 368 }, { "epoch": 0.04, "grad_norm": 2.258459893263655, "learning_rate": 0.0009996482828633624, "loss": 4.3703, "step": 369 }, { "epoch": 0.04, "grad_norm": 2.236042899052531, "learning_rate": 0.0009996412845259183, "loss": 4.3142, "step": 370 }, { "epoch": 0.04, "grad_norm": 1.6625819652450904, "learning_rate": 0.0009996342172724833, "loss": 4.3278, "step": 371 }, { "epoch": 0.04, "grad_norm": 1.6672033979225918, "learning_rate": 0.0009996270811040318, "loss": 4.1417, "step": 372 }, { "epoch": 0.04, "grad_norm": 1.6005486453350695, "learning_rate": 0.0009996198760215483, "loss": 3.9964, "step": 373 }, { "epoch": 0.04, "grad_norm": 3.263713271734624, "learning_rate": 0.0009996126020260262, "loss": 4.3397, "step": 374 }, { "epoch": 0.04, "grad_norm": 6.042613390850423, "learning_rate": 0.0009996052591184695, "loss": 4.3059, "step": 375 }, { "epoch": 0.04, "grad_norm": 2.019759351530601, "learning_rate": 0.0009995978472998905, "loss": 4.4018, "step": 376 }, { "epoch": 0.04, "grad_norm": 2.6550182571081407, "learning_rate": 0.0009995903665713118, "loss": 4.3145, "step": 377 }, { "epoch": 0.04, "grad_norm": 1.4386895694884438, "learning_rate": 0.000999582816933765, "loss": 4.055, "step": 378 }, { "epoch": 0.04, "grad_norm": 3.9893663873029506, "learning_rate": 0.0009995751983882914, "loss": 4.2387, "step": 379 }, { "epoch": 0.04, "grad_norm": 1.6346449655390993, "learning_rate": 0.000999567510935942, "loss": 4.4161, "step": 380 }, { "epoch": 0.04, "grad_norm": 2.1257868823161052, "learning_rate": 0.0009995597545777771, "loss": 4.2795, "step": 381 }, { "epoch": 0.04, "grad_norm": 1.6307367926860996, "learning_rate": 0.0009995519293148666, "loss": 4.1981, "step": 382 }, { "epoch": 0.04, "grad_norm": 13.013016741828519, "learning_rate": 0.0009995440351482897, "loss": 4.3229, "step": 383 }, { "epoch": 0.04, "grad_norm": 2.4479239768136543, "learning_rate": 0.0009995360720791353, "loss": 4.3973, "step": 384 }, { "epoch": 
0.04, "grad_norm": 2.588776467371206, "learning_rate": 0.000999528040108502, "loss": 4.2682, "step": 385 }, { "epoch": 0.04, "grad_norm": 2.0044950076119203, "learning_rate": 0.0009995199392374972, "loss": 4.384, "step": 386 }, { "epoch": 0.04, "grad_norm": 1.8058105172185372, "learning_rate": 0.0009995117694672386, "loss": 4.3045, "step": 387 }, { "epoch": 0.04, "grad_norm": 1.6770517417622641, "learning_rate": 0.000999503530798853, "loss": 4.3202, "step": 388 }, { "epoch": 0.04, "grad_norm": 1.700323995527632, "learning_rate": 0.0009994952232334766, "loss": 4.3961, "step": 389 }, { "epoch": 0.04, "grad_norm": 3.313482945509579, "learning_rate": 0.0009994868467722556, "loss": 4.2835, "step": 390 }, { "epoch": 0.04, "grad_norm": 1.700153990552024, "learning_rate": 0.0009994784014163449, "loss": 4.1578, "step": 391 }, { "epoch": 0.04, "grad_norm": 1.5695833672737198, "learning_rate": 0.0009994698871669098, "loss": 4.3772, "step": 392 }, { "epoch": 0.05, "grad_norm": 2.212631354821127, "learning_rate": 0.0009994613040251246, "loss": 4.3751, "step": 393 }, { "epoch": 0.05, "grad_norm": 2.5348657822030423, "learning_rate": 0.000999452651992173, "loss": 4.4453, "step": 394 }, { "epoch": 0.05, "grad_norm": 1.6149432901585588, "learning_rate": 0.0009994439310692486, "loss": 3.9841, "step": 395 }, { "epoch": 0.05, "grad_norm": 2.666105294662668, "learning_rate": 0.0009994351412575542, "loss": 4.1464, "step": 396 }, { "epoch": 0.05, "grad_norm": 1.5783382385561004, "learning_rate": 0.000999426282558302, "loss": 4.3031, "step": 397 }, { "epoch": 0.05, "grad_norm": 18.503233663160536, "learning_rate": 0.000999417354972714, "loss": 4.1477, "step": 398 }, { "epoch": 0.05, "grad_norm": 1.9295979586677223, "learning_rate": 0.000999408358502022, "loss": 4.2777, "step": 399 }, { "epoch": 0.05, "grad_norm": 13.271562247852382, "learning_rate": 0.0009993992931474661, "loss": 4.393, "step": 400 }, { "epoch": 0.05, "grad_norm": 1.5272794660191136, "learning_rate": 0.0009993901589102974, "loss": 4.3466, "step": 401 }, { "epoch": 0.05, "grad_norm": 1.7289722946784687, "learning_rate": 0.0009993809557917754, "loss": 4.2126, "step": 402 }, { "epoch": 0.05, "grad_norm": 1.7436962197198087, "learning_rate": 0.0009993716837931696, "loss": 4.3352, "step": 403 }, { "epoch": 0.05, "grad_norm": 1.7667146316934688, "learning_rate": 0.000999362342915759, "loss": 4.3641, "step": 404 }, { "epoch": 0.05, "grad_norm": 1.8851738338170139, "learning_rate": 0.0009993529331608318, "loss": 4.3149, "step": 405 }, { "epoch": 0.05, "grad_norm": 1.841906870998191, "learning_rate": 0.0009993434545296862, "loss": 4.0531, "step": 406 }, { "epoch": 0.05, "grad_norm": 1.7948831816836226, "learning_rate": 0.0009993339070236292, "loss": 4.1343, "step": 407 }, { "epoch": 0.05, "grad_norm": 2.1275079839481053, "learning_rate": 0.000999324290643978, "loss": 4.0496, "step": 408 }, { "epoch": 0.05, "grad_norm": 1.4818385050433656, "learning_rate": 0.0009993146053920588, "loss": 3.9573, "step": 409 }, { "epoch": 0.05, "grad_norm": 1.752521535899123, "learning_rate": 0.0009993048512692078, "loss": 3.9854, "step": 410 }, { "epoch": 0.05, "grad_norm": 1.8337362018615744, "learning_rate": 0.00099929502827677, "loss": 4.3413, "step": 411 }, { "epoch": 0.05, "grad_norm": 2.0336218035934754, "learning_rate": 0.0009992851364161006, "loss": 4.3587, "step": 412 }, { "epoch": 0.05, "grad_norm": 1.915565119571634, "learning_rate": 0.0009992751756885637, "loss": 4.276, "step": 413 }, { "epoch": 0.05, "grad_norm": 1.9546455153324622, "learning_rate": 
0.0009992651460955335, "loss": 4.3891, "step": 414 }, { "epoch": 0.05, "grad_norm": 2.4577745924073646, "learning_rate": 0.0009992550476383931, "loss": 4.2087, "step": 415 }, { "epoch": 0.05, "grad_norm": 3.2691215162275262, "learning_rate": 0.0009992448803185356, "loss": 4.3393, "step": 416 }, { "epoch": 0.05, "grad_norm": 2.610639310338842, "learning_rate": 0.0009992346441373633, "loss": 4.0624, "step": 417 }, { "epoch": 0.05, "grad_norm": 1.9626820277896326, "learning_rate": 0.0009992243390962883, "loss": 4.352, "step": 418 }, { "epoch": 0.05, "grad_norm": 1.9064052282501875, "learning_rate": 0.0009992139651967319, "loss": 4.2696, "step": 419 }, { "epoch": 0.05, "grad_norm": 1.7362326839713969, "learning_rate": 0.0009992035224401245, "loss": 4.2131, "step": 420 }, { "epoch": 0.05, "grad_norm": 2.4089719382354127, "learning_rate": 0.0009991930108279074, "loss": 4.2663, "step": 421 }, { "epoch": 0.05, "grad_norm": 2.1180114937174173, "learning_rate": 0.0009991824303615293, "loss": 4.2941, "step": 422 }, { "epoch": 0.05, "grad_norm": 2.0291897225301434, "learning_rate": 0.0009991717810424506, "loss": 4.2532, "step": 423 }, { "epoch": 0.05, "grad_norm": 2.166154642141422, "learning_rate": 0.0009991610628721397, "loss": 4.4652, "step": 424 }, { "epoch": 0.05, "grad_norm": 2.0703801204957784, "learning_rate": 0.000999150275852075, "loss": 4.2655, "step": 425 }, { "epoch": 0.05, "grad_norm": 2.2162140820179754, "learning_rate": 0.0009991394199837444, "loss": 4.1911, "step": 426 }, { "epoch": 0.05, "grad_norm": 5.238083125786115, "learning_rate": 0.0009991284952686455, "loss": 3.9587, "step": 427 }, { "epoch": 0.05, "grad_norm": 2.849326714920507, "learning_rate": 0.0009991175017082848, "loss": 4.0868, "step": 428 }, { "epoch": 0.05, "grad_norm": 1.799034401256162, "learning_rate": 0.0009991064393041786, "loss": 4.2251, "step": 429 }, { "epoch": 0.05, "grad_norm": 1.9174502152936364, "learning_rate": 0.0009990953080578533, "loss": 4.2895, "step": 430 }, { "epoch": 0.05, "grad_norm": 2.203659418977307, "learning_rate": 0.0009990841079708435, "loss": 4.2059, "step": 431 }, { "epoch": 0.05, "grad_norm": 1.4663425647192094, "learning_rate": 0.0009990728390446946, "loss": 4.0535, "step": 432 }, { "epoch": 0.05, "grad_norm": 1.7921253380631696, "learning_rate": 0.0009990615012809608, "loss": 4.0972, "step": 433 }, { "epoch": 0.05, "grad_norm": 2.9260420989693356, "learning_rate": 0.0009990500946812058, "loss": 4.4988, "step": 434 }, { "epoch": 0.05, "grad_norm": 1.6175345447347704, "learning_rate": 0.000999038619247003, "loss": 4.1891, "step": 435 }, { "epoch": 0.05, "grad_norm": 1.951230819992521, "learning_rate": 0.0009990270749799352, "loss": 4.358, "step": 436 }, { "epoch": 0.05, "grad_norm": 1.757947677780169, "learning_rate": 0.0009990154618815948, "loss": 4.0558, "step": 437 }, { "epoch": 0.05, "grad_norm": 1.7981267884084104, "learning_rate": 0.0009990037799535833, "loss": 4.4319, "step": 438 }, { "epoch": 0.05, "grad_norm": 1.6343902102574257, "learning_rate": 0.0009989920291975124, "loss": 4.2578, "step": 439 }, { "epoch": 0.05, "grad_norm": 1.534702683399374, "learning_rate": 0.0009989802096150029, "loss": 4.4104, "step": 440 }, { "epoch": 0.05, "grad_norm": 1.8792229365270614, "learning_rate": 0.0009989683212076848, "loss": 4.1472, "step": 441 }, { "epoch": 0.05, "grad_norm": 2.2787236284817713, "learning_rate": 0.0009989563639771978, "loss": 4.2353, "step": 442 }, { "epoch": 0.05, "grad_norm": 2.1591237765238414, "learning_rate": 0.0009989443379251916, "loss": 4.2647, "step": 443 }, { 
"epoch": 0.05, "grad_norm": 3.543001898973933, "learning_rate": 0.0009989322430533245, "loss": 4.4519, "step": 444 }, { "epoch": 0.05, "grad_norm": 1.8949118389309028, "learning_rate": 0.0009989200793632652, "loss": 4.0826, "step": 445 }, { "epoch": 0.05, "grad_norm": 1.5129090931885067, "learning_rate": 0.0009989078468566912, "loss": 4.149, "step": 446 }, { "epoch": 0.05, "grad_norm": 1.5388328739408674, "learning_rate": 0.0009988955455352898, "loss": 4.3074, "step": 447 }, { "epoch": 0.05, "grad_norm": 1.4659995915610111, "learning_rate": 0.0009988831754007576, "loss": 4.2362, "step": 448 }, { "epoch": 0.05, "grad_norm": 1.964430049487941, "learning_rate": 0.000998870736454801, "loss": 3.9881, "step": 449 }, { "epoch": 0.05, "grad_norm": 2.028052731206351, "learning_rate": 0.0009988582286991356, "loss": 4.247, "step": 450 }, { "epoch": 0.05, "grad_norm": 1.3723451750133668, "learning_rate": 0.0009988456521354868, "loss": 4.1483, "step": 451 }, { "epoch": 0.05, "grad_norm": 1.9538192868375026, "learning_rate": 0.000998833006765589, "loss": 4.141, "step": 452 }, { "epoch": 0.05, "grad_norm": 1.5261153003688364, "learning_rate": 0.0009988202925911864, "loss": 4.2694, "step": 453 }, { "epoch": 0.05, "grad_norm": 3.501750158447568, "learning_rate": 0.000998807509614033, "loss": 4.3445, "step": 454 }, { "epoch": 0.05, "grad_norm": 4.669386668607383, "learning_rate": 0.0009987946578358918, "loss": 4.2316, "step": 455 }, { "epoch": 0.05, "grad_norm": 1.3746858376644284, "learning_rate": 0.0009987817372585355, "loss": 4.0968, "step": 456 }, { "epoch": 0.05, "grad_norm": 1.5880511855022112, "learning_rate": 0.000998768747883746, "loss": 4.1286, "step": 457 }, { "epoch": 0.05, "grad_norm": 1.6454582585077993, "learning_rate": 0.0009987556897133151, "loss": 4.1812, "step": 458 }, { "epoch": 0.05, "grad_norm": 2.207199443899028, "learning_rate": 0.0009987425627490441, "loss": 4.3578, "step": 459 }, { "epoch": 0.05, "grad_norm": 6.218611870147262, "learning_rate": 0.0009987293669927436, "loss": 4.144, "step": 460 }, { "epoch": 0.05, "grad_norm": 2.8647623585297053, "learning_rate": 0.0009987161024462333, "loss": 4.4358, "step": 461 }, { "epoch": 0.05, "grad_norm": 1.5780177190999307, "learning_rate": 0.0009987027691113432, "loss": 4.2938, "step": 462 }, { "epoch": 0.05, "grad_norm": 3.643935270796855, "learning_rate": 0.0009986893669899123, "loss": 4.0997, "step": 463 }, { "epoch": 0.05, "grad_norm": 1.3913137624213265, "learning_rate": 0.0009986758960837889, "loss": 4.3316, "step": 464 }, { "epoch": 0.05, "grad_norm": 17.9844694047189, "learning_rate": 0.0009986623563948314, "loss": 4.2917, "step": 465 }, { "epoch": 0.05, "grad_norm": 5.452731477895287, "learning_rate": 0.000998648747924907, "loss": 4.3958, "step": 466 }, { "epoch": 0.05, "grad_norm": 3.0000241153277587, "learning_rate": 0.0009986350706758934, "loss": 4.3869, "step": 467 }, { "epoch": 0.05, "grad_norm": 2.741074881401537, "learning_rate": 0.0009986213246496762, "loss": 4.3957, "step": 468 }, { "epoch": 0.05, "grad_norm": 2.0878764553363762, "learning_rate": 0.000998607509848152, "loss": 4.2206, "step": 469 }, { "epoch": 0.05, "grad_norm": 6.877893107138887, "learning_rate": 0.0009985936262732263, "loss": 4.2682, "step": 470 }, { "epoch": 0.05, "grad_norm": 4.919548057984632, "learning_rate": 0.0009985796739268138, "loss": 4.3031, "step": 471 }, { "epoch": 0.05, "grad_norm": 2.477972947699397, "learning_rate": 0.000998565652810839, "loss": 3.9013, "step": 472 }, { "epoch": 0.05, "grad_norm": 2.6115640159806115, "learning_rate": 
0.000998551562927236, "loss": 4.3149, "step": 473 }, { "epoch": 0.05, "grad_norm": 1.6926204004652123, "learning_rate": 0.000998537404277948, "loss": 4.0781, "step": 474 }, { "epoch": 0.05, "grad_norm": 3.503964600656895, "learning_rate": 0.0009985231768649284, "loss": 4.2242, "step": 475 }, { "epoch": 0.05, "grad_norm": 4.070943383900958, "learning_rate": 0.000998508880690139, "loss": 4.2951, "step": 476 }, { "epoch": 0.05, "grad_norm": 2.9894425687749657, "learning_rate": 0.000998494515755552, "loss": 4.1612, "step": 477 }, { "epoch": 0.05, "grad_norm": 2.1248217467194497, "learning_rate": 0.0009984800820631488, "loss": 4.3319, "step": 478 }, { "epoch": 0.05, "grad_norm": 5.950256293847512, "learning_rate": 0.0009984655796149201, "loss": 4.3368, "step": 479 }, { "epoch": 0.06, "grad_norm": 2.0689678535102844, "learning_rate": 0.0009984510084128661, "loss": 4.2332, "step": 480 }, { "epoch": 0.06, "grad_norm": 1.7092029509044695, "learning_rate": 0.0009984363684589972, "loss": 4.1943, "step": 481 }, { "epoch": 0.06, "grad_norm": 1.6521065901800434, "learning_rate": 0.0009984216597553322, "loss": 4.2568, "step": 482 }, { "epoch": 0.06, "grad_norm": 1.4686265224467994, "learning_rate": 0.0009984068823039, "loss": 4.2743, "step": 483 }, { "epoch": 0.06, "grad_norm": 1.4835487141014911, "learning_rate": 0.0009983920361067388, "loss": 4.1603, "step": 484 }, { "epoch": 0.06, "grad_norm": 1.93649156069073, "learning_rate": 0.0009983771211658965, "loss": 4.2445, "step": 485 }, { "epoch": 0.06, "grad_norm": 1.873355008629887, "learning_rate": 0.0009983621374834303, "loss": 4.2099, "step": 486 }, { "epoch": 0.06, "grad_norm": 2.3370407304330634, "learning_rate": 0.0009983470850614068, "loss": 4.0677, "step": 487 }, { "epoch": 0.06, "grad_norm": 6.253559872165097, "learning_rate": 0.0009983319639019024, "loss": 4.0745, "step": 488 }, { "epoch": 0.06, "grad_norm": 6.492835046760343, "learning_rate": 0.0009983167740070025, "loss": 4.2605, "step": 489 }, { "epoch": 0.06, "grad_norm": 2.746142226999182, "learning_rate": 0.0009983015153788026, "loss": 4.161, "step": 490 }, { "epoch": 0.06, "grad_norm": 1.5639655189064725, "learning_rate": 0.000998286188019407, "loss": 4.2603, "step": 491 }, { "epoch": 0.06, "grad_norm": 2.350633150171923, "learning_rate": 0.00099827079193093, "loss": 4.1694, "step": 492 }, { "epoch": 0.06, "grad_norm": 1.6565380199138984, "learning_rate": 0.0009982553271154953, "loss": 4.192, "step": 493 }, { "epoch": 0.06, "grad_norm": 2.2705613707110097, "learning_rate": 0.0009982397935752356, "loss": 4.1158, "step": 494 }, { "epoch": 0.06, "grad_norm": 18.84429440619059, "learning_rate": 0.0009982241913122937, "loss": 4.0334, "step": 495 }, { "epoch": 0.06, "grad_norm": 3.055665035215563, "learning_rate": 0.000998208520328822, "loss": 4.4203, "step": 496 }, { "epoch": 0.06, "grad_norm": 1.441392501225691, "learning_rate": 0.0009981927806269812, "loss": 4.0862, "step": 497 }, { "epoch": 0.06, "grad_norm": 1.5577449715134128, "learning_rate": 0.0009981769722089428, "loss": 4.4479, "step": 498 }, { "epoch": 0.06, "grad_norm": 2.4614879040548208, "learning_rate": 0.0009981610950768873, "loss": 4.125, "step": 499 }, { "epoch": 0.06, "grad_norm": 1.6579332687518427, "learning_rate": 0.0009981451492330046, "loss": 4.3426, "step": 500 }, { "epoch": 0.06, "grad_norm": 7.369106904230558, "learning_rate": 0.000998129134679494, "loss": 4.2844, "step": 501 }, { "epoch": 0.06, "grad_norm": 1.7276112174033893, "learning_rate": 0.0009981130514185646, "loss": 4.1629, "step": 502 }, { "epoch": 0.06, 
"grad_norm": 1.5343655248890107, "learning_rate": 0.0009980968994524344, "loss": 4.1751, "step": 503 }, { "epoch": 0.06, "grad_norm": 1.8092352613409772, "learning_rate": 0.0009980806787833316, "loss": 4.4037, "step": 504 }, { "epoch": 0.06, "grad_norm": 2.018234558118657, "learning_rate": 0.0009980643894134935, "loss": 4.2689, "step": 505 }, { "epoch": 0.06, "grad_norm": 4.532672358406873, "learning_rate": 0.000998048031345167, "loss": 4.1466, "step": 506 }, { "epoch": 0.06, "grad_norm": 2.357779531031464, "learning_rate": 0.0009980316045806082, "loss": 4.0676, "step": 507 }, { "epoch": 0.06, "grad_norm": 4.672931895119673, "learning_rate": 0.0009980151091220826, "loss": 4.2032, "step": 508 }, { "epoch": 0.06, "grad_norm": 3.0914372512184762, "learning_rate": 0.000997998544971866, "loss": 4.4462, "step": 509 }, { "epoch": 0.06, "grad_norm": 2.4854638450210507, "learning_rate": 0.0009979819121322426, "loss": 4.1935, "step": 510 }, { "epoch": 0.06, "grad_norm": 2.291677108105903, "learning_rate": 0.000997965210605507, "loss": 4.2671, "step": 511 }, { "epoch": 0.06, "grad_norm": 2.6819995552127747, "learning_rate": 0.0009979484403939626, "loss": 3.9394, "step": 512 }, { "epoch": 0.06, "grad_norm": 1.692863536077385, "learning_rate": 0.0009979316014999226, "loss": 4.1667, "step": 513 }, { "epoch": 0.06, "grad_norm": 2.0660018132123836, "learning_rate": 0.0009979146939257098, "loss": 4.1103, "step": 514 }, { "epoch": 0.06, "grad_norm": 1.6542981921664894, "learning_rate": 0.000997897717673656, "loss": 4.1394, "step": 515 }, { "epoch": 0.06, "grad_norm": 2.0758964332935688, "learning_rate": 0.0009978806727461028, "loss": 4.1829, "step": 516 }, { "epoch": 0.06, "grad_norm": 2.078159191925465, "learning_rate": 0.000997863559145401, "loss": 4.3216, "step": 517 }, { "epoch": 0.06, "grad_norm": 1.6900710796975273, "learning_rate": 0.0009978463768739118, "loss": 4.16, "step": 518 }, { "epoch": 0.06, "grad_norm": 3.3432192379163337, "learning_rate": 0.0009978291259340045, "loss": 3.9388, "step": 519 }, { "epoch": 0.06, "grad_norm": 14.117045065668476, "learning_rate": 0.0009978118063280587, "loss": 4.3009, "step": 520 }, { "epoch": 0.06, "grad_norm": 3.246089543413055, "learning_rate": 0.0009977944180584637, "loss": 3.9631, "step": 521 }, { "epoch": 0.06, "grad_norm": 2.8200497924042276, "learning_rate": 0.0009977769611276173, "loss": 4.134, "step": 522 }, { "epoch": 0.06, "grad_norm": 1.6167132481924043, "learning_rate": 0.0009977594355379275, "loss": 4.081, "step": 523 }, { "epoch": 0.06, "grad_norm": 8.680359288221759, "learning_rate": 0.000997741841291812, "loss": 4.1658, "step": 524 }, { "epoch": 0.06, "grad_norm": 2.126819006450614, "learning_rate": 0.000997724178391697, "loss": 4.3393, "step": 525 }, { "epoch": 0.06, "grad_norm": 1.8528031239121752, "learning_rate": 0.0009977064468400193, "loss": 4.2447, "step": 526 }, { "epoch": 0.06, "grad_norm": 1.8353821028784003, "learning_rate": 0.0009976886466392244, "loss": 4.266, "step": 527 }, { "epoch": 0.06, "grad_norm": 3.6211498853413175, "learning_rate": 0.0009976707777917676, "loss": 4.1355, "step": 528 }, { "epoch": 0.06, "grad_norm": 1.8929175801964857, "learning_rate": 0.0009976528403001133, "loss": 4.0475, "step": 529 }, { "epoch": 0.06, "grad_norm": 1.694986958117957, "learning_rate": 0.0009976348341667358, "loss": 4.2859, "step": 530 }, { "epoch": 0.06, "grad_norm": 2.5402945005582778, "learning_rate": 0.0009976167593941188, "loss": 4.3683, "step": 531 }, { "epoch": 0.06, "grad_norm": 1.64245041374947, "learning_rate": 
0.000997598615984755, "loss": 4.2393, "step": 532 }, { "epoch": 0.06, "grad_norm": 3.728491005393638, "learning_rate": 0.0009975804039411475, "loss": 4.3714, "step": 533 }, { "epoch": 0.06, "grad_norm": 1.6830694570151084, "learning_rate": 0.0009975621232658082, "loss": 4.2365, "step": 534 }, { "epoch": 0.06, "grad_norm": 2.096126411060897, "learning_rate": 0.000997543773961258, "loss": 4.2833, "step": 535 }, { "epoch": 0.06, "grad_norm": 1.8034169738351644, "learning_rate": 0.0009975253560300283, "loss": 4.0742, "step": 536 }, { "epoch": 0.06, "grad_norm": 2.9025777342393484, "learning_rate": 0.0009975068694746596, "loss": 4.3164, "step": 537 }, { "epoch": 0.06, "grad_norm": 2.9296567816674317, "learning_rate": 0.0009974883142977015, "loss": 4.4549, "step": 538 }, { "epoch": 0.06, "grad_norm": 1.567127941894126, "learning_rate": 0.0009974696905017135, "loss": 4.0583, "step": 539 }, { "epoch": 0.06, "grad_norm": 1.8638593263114727, "learning_rate": 0.0009974509980892642, "loss": 4.1952, "step": 540 }, { "epoch": 0.06, "grad_norm": 1.6550109826024293, "learning_rate": 0.0009974322370629321, "loss": 4.3037, "step": 541 }, { "epoch": 0.06, "grad_norm": 3.121632740302096, "learning_rate": 0.000997413407425305, "loss": 4.1832, "step": 542 }, { "epoch": 0.06, "grad_norm": 2.068447554198298, "learning_rate": 0.0009973945091789796, "loss": 4.2553, "step": 543 }, { "epoch": 0.06, "grad_norm": 2.910179266342006, "learning_rate": 0.000997375542326563, "loss": 4.2223, "step": 544 }, { "epoch": 0.06, "grad_norm": 1.5284733032009923, "learning_rate": 0.0009973565068706711, "loss": 4.0156, "step": 545 }, { "epoch": 0.06, "grad_norm": 1.7510863726112291, "learning_rate": 0.0009973374028139296, "loss": 4.2565, "step": 546 }, { "epoch": 0.06, "grad_norm": 1.9848094530248792, "learning_rate": 0.0009973182301589736, "loss": 4.3132, "step": 547 }, { "epoch": 0.06, "grad_norm": 1.6381911471511024, "learning_rate": 0.0009972989889084473, "loss": 4.0916, "step": 548 }, { "epoch": 0.06, "grad_norm": 2.4405959601297056, "learning_rate": 0.000997279679065005, "loss": 4.1835, "step": 549 }, { "epoch": 0.06, "grad_norm": 1.5295386302663294, "learning_rate": 0.0009972603006313098, "loss": 4.404, "step": 550 }, { "epoch": 0.06, "grad_norm": 1.6385055148611942, "learning_rate": 0.000997240853610035, "loss": 4.1902, "step": 551 }, { "epoch": 0.06, "grad_norm": 1.7454511130832677, "learning_rate": 0.0009972213380038627, "loss": 4.1052, "step": 552 }, { "epoch": 0.06, "grad_norm": 1.6681393752840339, "learning_rate": 0.0009972017538154845, "loss": 4.2802, "step": 553 }, { "epoch": 0.06, "grad_norm": 1.6040281330057087, "learning_rate": 0.000997182101047602, "loss": 4.0489, "step": 554 }, { "epoch": 0.06, "grad_norm": 3.4030647217117673, "learning_rate": 0.0009971623797029258, "loss": 4.1567, "step": 555 }, { "epoch": 0.06, "grad_norm": 1.5666576187938674, "learning_rate": 0.0009971425897841765, "loss": 4.0752, "step": 556 }, { "epoch": 0.06, "grad_norm": 2.4258993087571086, "learning_rate": 0.0009971227312940826, "loss": 4.2585, "step": 557 }, { "epoch": 0.06, "grad_norm": 2.490189204673003, "learning_rate": 0.0009971028042353844, "loss": 4.149, "step": 558 }, { "epoch": 0.06, "grad_norm": 2.3943923707266426, "learning_rate": 0.00099708280861083, "loss": 4.2182, "step": 559 }, { "epoch": 0.06, "grad_norm": 8.294620444891299, "learning_rate": 0.0009970627444231776, "loss": 4.1478, "step": 560 }, { "epoch": 0.06, "grad_norm": 2.099022241482801, "learning_rate": 0.000997042611675194, "loss": 3.9196, "step": 561 }, { "epoch": 
0.06, "grad_norm": 3.5327949908732244, "learning_rate": 0.0009970224103696568, "loss": 4.119, "step": 562 }, { "epoch": 0.06, "grad_norm": 1.5405980673701214, "learning_rate": 0.0009970021405093523, "loss": 4.2492, "step": 563 }, { "epoch": 0.06, "grad_norm": 1.3770024558445135, "learning_rate": 0.0009969818020970761, "loss": 4.0072, "step": 564 }, { "epoch": 0.06, "grad_norm": 1.9389978603214046, "learning_rate": 0.0009969613951356338, "loss": 4.1968, "step": 565 }, { "epoch": 0.06, "grad_norm": 1.638652640315646, "learning_rate": 0.0009969409196278398, "loss": 4.1112, "step": 566 }, { "epoch": 0.07, "grad_norm": 1.930863377097848, "learning_rate": 0.0009969203755765186, "loss": 4.2138, "step": 567 }, { "epoch": 0.07, "grad_norm": 5.791744705846251, "learning_rate": 0.0009968997629845038, "loss": 4.1767, "step": 568 }, { "epoch": 0.07, "grad_norm": 2.3561504351419083, "learning_rate": 0.0009968790818546383, "loss": 4.0966, "step": 569 }, { "epoch": 0.07, "grad_norm": 4.2937126443027225, "learning_rate": 0.000996858332189775, "loss": 4.1656, "step": 570 }, { "epoch": 0.07, "grad_norm": 1.6294494067422438, "learning_rate": 0.0009968375139927756, "loss": 4.0909, "step": 571 }, { "epoch": 0.07, "grad_norm": 1.8781487043247158, "learning_rate": 0.000996816627266512, "loss": 4.4467, "step": 572 }, { "epoch": 0.07, "grad_norm": 1.8872596414079694, "learning_rate": 0.0009967956720138647, "loss": 4.0405, "step": 573 }, { "epoch": 0.07, "grad_norm": 1.6633805768612884, "learning_rate": 0.0009967746482377243, "loss": 4.2121, "step": 574 }, { "epoch": 0.07, "grad_norm": 14.626449747268204, "learning_rate": 0.0009967535559409905, "loss": 4.1391, "step": 575 }, { "epoch": 0.07, "grad_norm": 1.618113444728313, "learning_rate": 0.0009967323951265725, "loss": 4.1785, "step": 576 }, { "epoch": 0.07, "grad_norm": 1.8767844547774968, "learning_rate": 0.0009967111657973892, "loss": 4.0174, "step": 577 }, { "epoch": 0.07, "grad_norm": 1.8698572991517326, "learning_rate": 0.000996689867956369, "loss": 4.26, "step": 578 }, { "epoch": 0.07, "grad_norm": 1.8675704301984564, "learning_rate": 0.0009966685016064491, "loss": 4.3081, "step": 579 }, { "epoch": 0.07, "grad_norm": 1.517905546119349, "learning_rate": 0.0009966470667505767, "loss": 4.2606, "step": 580 }, { "epoch": 0.07, "grad_norm": 7.942138784916919, "learning_rate": 0.0009966255633917086, "loss": 4.1734, "step": 581 }, { "epoch": 0.07, "grad_norm": 3.109881154962878, "learning_rate": 0.0009966039915328105, "loss": 4.3322, "step": 582 }, { "epoch": 0.07, "grad_norm": 2.6260213774034264, "learning_rate": 0.0009965823511768578, "loss": 4.2122, "step": 583 }, { "epoch": 0.07, "grad_norm": 1.59963940916314, "learning_rate": 0.0009965606423268355, "loss": 4.3441, "step": 584 }, { "epoch": 0.07, "grad_norm": 1.5827052836385467, "learning_rate": 0.000996538864985738, "loss": 4.2994, "step": 585 }, { "epoch": 0.07, "grad_norm": 2.0063540937529667, "learning_rate": 0.0009965170191565688, "loss": 4.2068, "step": 586 }, { "epoch": 0.07, "grad_norm": 2.3005152691975743, "learning_rate": 0.0009964951048423414, "loss": 4.2493, "step": 587 }, { "epoch": 0.07, "grad_norm": 1.9998032245088222, "learning_rate": 0.0009964731220460784, "loss": 4.2832, "step": 588 }, { "epoch": 0.07, "grad_norm": 2.1301042358022464, "learning_rate": 0.000996451070770812, "loss": 4.4335, "step": 589 }, { "epoch": 0.07, "grad_norm": 3.483420693736498, "learning_rate": 0.0009964289510195831, "loss": 4.3237, "step": 590 }, { "epoch": 0.07, "grad_norm": 1.8218578874465572, "learning_rate": 
0.0009964067627954436, "loss": 4.2681, "step": 591 }, { "epoch": 0.07, "grad_norm": 3.2695489788337544, "learning_rate": 0.0009963845061014534, "loss": 4.3798, "step": 592 }, { "epoch": 0.07, "grad_norm": 2.3136672244226433, "learning_rate": 0.0009963621809406826, "loss": 4.4811, "step": 593 }, { "epoch": 0.07, "grad_norm": 1.8833756760352607, "learning_rate": 0.0009963397873162107, "loss": 4.1932, "step": 594 }, { "epoch": 0.07, "grad_norm": 2.443851108751262, "learning_rate": 0.0009963173252311257, "loss": 4.6135, "step": 595 }, { "epoch": 0.07, "grad_norm": 7.727447223499446, "learning_rate": 0.0009962947946885268, "loss": 4.2424, "step": 596 }, { "epoch": 0.07, "grad_norm": 1.5240964267161377, "learning_rate": 0.000996272195691521, "loss": 4.2512, "step": 597 }, { "epoch": 0.07, "grad_norm": 5.841290162058688, "learning_rate": 0.0009962495282432255, "loss": 4.2481, "step": 598 }, { "epoch": 0.07, "grad_norm": 5.125850925921278, "learning_rate": 0.0009962267923467672, "loss": 4.2708, "step": 599 }, { "epoch": 0.07, "grad_norm": 3.2468804773755444, "learning_rate": 0.0009962039880052817, "loss": 4.3985, "step": 600 }, { "epoch": 0.07, "grad_norm": 2.3778452385039732, "learning_rate": 0.0009961811152219148, "loss": 4.4049, "step": 601 }, { "epoch": 0.07, "grad_norm": 1.7369715892114377, "learning_rate": 0.0009961581739998209, "loss": 4.2272, "step": 602 }, { "epoch": 0.07, "grad_norm": 2.4163850075245925, "learning_rate": 0.0009961351643421646, "loss": 4.4035, "step": 603 }, { "epoch": 0.07, "grad_norm": 1.8536560949329932, "learning_rate": 0.0009961120862521195, "loss": 4.2444, "step": 604 }, { "epoch": 0.07, "grad_norm": 1.894483838812162, "learning_rate": 0.000996088939732869, "loss": 4.0956, "step": 605 }, { "epoch": 0.07, "grad_norm": 1.5356493966175457, "learning_rate": 0.0009960657247876056, "loss": 4.3695, "step": 606 }, { "epoch": 0.07, "grad_norm": 3.5693583489940317, "learning_rate": 0.000996042441419531, "loss": 4.2233, "step": 607 }, { "epoch": 0.07, "grad_norm": 1.7691872791034506, "learning_rate": 0.0009960190896318572, "loss": 4.0849, "step": 608 }, { "epoch": 0.07, "grad_norm": 1.9034357939220619, "learning_rate": 0.0009959956694278052, "loss": 4.0435, "step": 609 }, { "epoch": 0.07, "grad_norm": 1.5325370500536477, "learning_rate": 0.000995972180810605, "loss": 4.022, "step": 610 }, { "epoch": 0.07, "grad_norm": 1.6320464420383207, "learning_rate": 0.0009959486237834964, "loss": 3.9626, "step": 611 }, { "epoch": 0.07, "grad_norm": 2.5971150924266686, "learning_rate": 0.0009959249983497289, "loss": 4.3555, "step": 612 }, { "epoch": 0.07, "grad_norm": 2.2023426282545433, "learning_rate": 0.0009959013045125612, "loss": 4.1113, "step": 613 }, { "epoch": 0.07, "grad_norm": 2.2811148290064023, "learning_rate": 0.000995877542275261, "loss": 4.0316, "step": 614 }, { "epoch": 0.07, "grad_norm": 2.0133575701991147, "learning_rate": 0.0009958537116411064, "loss": 4.1478, "step": 615 }, { "epoch": 0.07, "grad_norm": 1.6312393649543804, "learning_rate": 0.000995829812613384, "loss": 4.2748, "step": 616 }, { "epoch": 0.07, "grad_norm": 1.6646343951920706, "learning_rate": 0.0009958058451953902, "loss": 4.1427, "step": 617 }, { "epoch": 0.07, "grad_norm": 2.583598781245373, "learning_rate": 0.0009957818093904313, "loss": 4.2103, "step": 618 }, { "epoch": 0.07, "grad_norm": 2.5056116984631913, "learning_rate": 0.000995757705201822, "loss": 4.2682, "step": 619 }, { "epoch": 0.07, "grad_norm": 1.763099457100209, "learning_rate": 0.0009957335326328874, "loss": 4.3384, "step": 620 }, { 
"epoch": 0.07, "grad_norm": 2.4799361809786657, "learning_rate": 0.0009957092916869613, "loss": 4.2228, "step": 621 }, { "epoch": 0.07, "grad_norm": 1.8739950692834075, "learning_rate": 0.0009956849823673877, "loss": 4.0103, "step": 622 }, { "epoch": 0.07, "grad_norm": 3.9932518164258224, "learning_rate": 0.0009956606046775192, "loss": 4.2678, "step": 623 }, { "epoch": 0.07, "grad_norm": 1.9654273013535766, "learning_rate": 0.0009956361586207186, "loss": 4.0602, "step": 624 }, { "epoch": 0.07, "grad_norm": 2.5784187999283485, "learning_rate": 0.0009956116442003575, "loss": 4.2358, "step": 625 }, { "epoch": 0.07, "grad_norm": 1.5145117531966559, "learning_rate": 0.0009955870614198174, "loss": 3.9762, "step": 626 }, { "epoch": 0.07, "grad_norm": 1.9447115037968803, "learning_rate": 0.000995562410282489, "loss": 4.3275, "step": 627 }, { "epoch": 0.07, "grad_norm": 3.7619941459243282, "learning_rate": 0.0009955376907917722, "loss": 4.0099, "step": 628 }, { "epoch": 0.07, "grad_norm": 2.730349569457537, "learning_rate": 0.0009955129029510768, "loss": 4.2631, "step": 629 }, { "epoch": 0.07, "grad_norm": 1.8009487627892025, "learning_rate": 0.0009954880467638219, "loss": 4.2296, "step": 630 }, { "epoch": 0.07, "grad_norm": 1.6766582943336574, "learning_rate": 0.0009954631222334356, "loss": 4.261, "step": 631 }, { "epoch": 0.07, "grad_norm": 1.8044879831624756, "learning_rate": 0.0009954381293633561, "loss": 4.2813, "step": 632 }, { "epoch": 0.07, "grad_norm": 1.8032839691641127, "learning_rate": 0.0009954130681570305, "loss": 4.143, "step": 633 }, { "epoch": 0.07, "grad_norm": 2.1728947228530937, "learning_rate": 0.0009953879386179157, "loss": 4.4043, "step": 634 }, { "epoch": 0.07, "grad_norm": 1.7272339233637788, "learning_rate": 0.0009953627407494777, "loss": 4.2593, "step": 635 }, { "epoch": 0.07, "grad_norm": 3.9634826256171722, "learning_rate": 0.000995337474555192, "loss": 4.0655, "step": 636 }, { "epoch": 0.07, "grad_norm": 2.1675539457130757, "learning_rate": 0.0009953121400385438, "loss": 4.0158, "step": 637 }, { "epoch": 0.07, "grad_norm": 2.55258611059383, "learning_rate": 0.0009952867372030273, "loss": 4.2224, "step": 638 }, { "epoch": 0.07, "grad_norm": 1.4936920879296898, "learning_rate": 0.0009952612660521466, "loss": 4.1504, "step": 639 }, { "epoch": 0.07, "grad_norm": 2.1607837665178105, "learning_rate": 0.0009952357265894146, "loss": 4.0826, "step": 640 }, { "epoch": 0.07, "grad_norm": 1.4878934355875717, "learning_rate": 0.000995210118818354, "loss": 4.2111, "step": 641 }, { "epoch": 0.07, "grad_norm": 1.4641771260355996, "learning_rate": 0.0009951844427424973, "loss": 4.1426, "step": 642 }, { "epoch": 0.07, "grad_norm": 4.705371414196986, "learning_rate": 0.0009951586983653858, "loss": 4.1167, "step": 643 }, { "epoch": 0.07, "grad_norm": 1.4433656253574085, "learning_rate": 0.0009951328856905703, "loss": 4.0358, "step": 644 }, { "epoch": 0.07, "grad_norm": 3.450422403247797, "learning_rate": 0.0009951070047216116, "loss": 4.2028, "step": 645 }, { "epoch": 0.07, "grad_norm": 2.600762399325895, "learning_rate": 0.000995081055462079, "loss": 4.1877, "step": 646 }, { "epoch": 0.07, "grad_norm": 3.2570833918184467, "learning_rate": 0.0009950550379155519, "loss": 4.0667, "step": 647 }, { "epoch": 0.07, "grad_norm": 1.579248260090989, "learning_rate": 0.000995028952085619, "loss": 4.0954, "step": 648 }, { "epoch": 0.07, "grad_norm": 2.7760186120143513, "learning_rate": 0.0009950027979758781, "loss": 4.0575, "step": 649 }, { "epoch": 0.07, "grad_norm": 2.276675031146847, 
"learning_rate": 0.0009949765755899369, "loss": 4.2075, "step": 650 }, { "epoch": 0.07, "grad_norm": 2.130365431947036, "learning_rate": 0.0009949502849314123, "loss": 4.2476, "step": 651 }, { "epoch": 0.07, "grad_norm": 4.149523073498354, "learning_rate": 0.0009949239260039304, "loss": 4.2608, "step": 652 }, { "epoch": 0.07, "grad_norm": 2.261015719515799, "learning_rate": 0.0009948974988111272, "loss": 4.0788, "step": 653 }, { "epoch": 0.07, "grad_norm": 2.2353527542245377, "learning_rate": 0.0009948710033566475, "loss": 3.9486, "step": 654 }, { "epoch": 0.08, "grad_norm": 2.0264450641185165, "learning_rate": 0.000994844439644146, "loss": 4.0133, "step": 655 }, { "epoch": 0.08, "grad_norm": 1.5930066386604713, "learning_rate": 0.0009948178076772867, "loss": 4.125, "step": 656 }, { "epoch": 0.08, "grad_norm": 1.6671940601079251, "learning_rate": 0.0009947911074597428, "loss": 4.2969, "step": 657 }, { "epoch": 0.08, "grad_norm": 1.9048775750598588, "learning_rate": 0.0009947643389951973, "loss": 4.3319, "step": 658 }, { "epoch": 0.08, "grad_norm": 2.3279909476391536, "learning_rate": 0.0009947375022873422, "loss": 4.1367, "step": 659 }, { "epoch": 0.08, "grad_norm": 1.6630175441269184, "learning_rate": 0.0009947105973398794, "loss": 4.0271, "step": 660 }, { "epoch": 0.08, "grad_norm": 1.8733742540325784, "learning_rate": 0.0009946836241565195, "loss": 4.4839, "step": 661 }, { "epoch": 0.08, "grad_norm": 1.3614379211115837, "learning_rate": 0.0009946565827409833, "loss": 4.028, "step": 662 }, { "epoch": 0.08, "grad_norm": 3.2318834174086395, "learning_rate": 0.0009946294730970005, "loss": 4.3107, "step": 663 }, { "epoch": 0.08, "grad_norm": 1.9264791182269447, "learning_rate": 0.0009946022952283106, "loss": 4.0804, "step": 664 }, { "epoch": 0.08, "grad_norm": 1.3858248901181287, "learning_rate": 0.0009945750491386616, "loss": 4.2127, "step": 665 }, { "epoch": 0.08, "grad_norm": 2.615187478988594, "learning_rate": 0.0009945477348318123, "loss": 4.4193, "step": 666 }, { "epoch": 0.08, "grad_norm": 1.3066164462862055, "learning_rate": 0.00099452035231153, "loss": 4.2361, "step": 667 }, { "epoch": 0.08, "grad_norm": 3.2657794602237598, "learning_rate": 0.0009944929015815913, "loss": 4.1893, "step": 668 }, { "epoch": 0.08, "grad_norm": 1.3495378134136684, "learning_rate": 0.0009944653826457828, "loss": 4.0479, "step": 669 }, { "epoch": 0.08, "grad_norm": 2.198093552915661, "learning_rate": 0.0009944377955079004, "loss": 4.094, "step": 670 }, { "epoch": 0.08, "grad_norm": 1.545953233131494, "learning_rate": 0.0009944101401717486, "loss": 4.0587, "step": 671 }, { "epoch": 0.08, "grad_norm": 1.5739775183187246, "learning_rate": 0.0009943824166411424, "loss": 4.0339, "step": 672 }, { "epoch": 0.08, "grad_norm": 3.8165595196877913, "learning_rate": 0.0009943546249199056, "loss": 4.0908, "step": 673 }, { "epoch": 0.08, "grad_norm": 1.404980398724772, "learning_rate": 0.0009943267650118716, "loss": 3.8289, "step": 674 }, { "epoch": 0.08, "grad_norm": 1.5534329138041343, "learning_rate": 0.0009942988369208829, "loss": 4.1591, "step": 675 }, { "epoch": 0.08, "grad_norm": 1.3782346297812145, "learning_rate": 0.000994270840650792, "loss": 3.8774, "step": 676 }, { "epoch": 0.08, "grad_norm": 1.9689757221263482, "learning_rate": 0.0009942427762054604, "loss": 4.0182, "step": 677 }, { "epoch": 0.08, "grad_norm": 1.6388120724365827, "learning_rate": 0.0009942146435887589, "loss": 4.0247, "step": 678 }, { "epoch": 0.08, "grad_norm": 2.7022461020912822, "learning_rate": 0.0009941864428045677, "loss": 4.3061, 
"step": 679 }, { "epoch": 0.08, "grad_norm": 1.4036826513721399, "learning_rate": 0.0009941581738567768, "loss": 4.2682, "step": 680 }, { "epoch": 0.08, "grad_norm": 1.6438828672318813, "learning_rate": 0.0009941298367492854, "loss": 4.3206, "step": 681 }, { "epoch": 0.08, "grad_norm": 1.9060765844969223, "learning_rate": 0.0009941014314860021, "loss": 4.2545, "step": 682 }, { "epoch": 0.08, "grad_norm": 1.663851978049675, "learning_rate": 0.0009940729580708448, "loss": 4.188, "step": 683 }, { "epoch": 0.08, "grad_norm": 1.8774721282338387, "learning_rate": 0.0009940444165077408, "loss": 4.2216, "step": 684 }, { "epoch": 0.08, "grad_norm": 1.6586895265799437, "learning_rate": 0.0009940158068006267, "loss": 4.241, "step": 685 }, { "epoch": 0.08, "grad_norm": 1.426418290088738, "learning_rate": 0.0009939871289534488, "loss": 4.1306, "step": 686 }, { "epoch": 0.08, "grad_norm": 1.6730196247514066, "learning_rate": 0.0009939583829701628, "loss": 4.07, "step": 687 }, { "epoch": 0.08, "grad_norm": 1.7847286391960653, "learning_rate": 0.0009939295688547337, "loss": 4.1507, "step": 688 }, { "epoch": 0.08, "grad_norm": 1.8442838104702075, "learning_rate": 0.0009939006866111356, "loss": 4.4061, "step": 689 }, { "epoch": 0.08, "grad_norm": 3.0998736098082627, "learning_rate": 0.0009938717362433524, "loss": 4.144, "step": 690 }, { "epoch": 0.08, "grad_norm": 3.0937603144936294, "learning_rate": 0.0009938427177553773, "loss": 4.2693, "step": 691 }, { "epoch": 0.08, "grad_norm": 1.6394566084453566, "learning_rate": 0.0009938136311512127, "loss": 4.1956, "step": 692 }, { "epoch": 0.08, "grad_norm": 1.359213797224181, "learning_rate": 0.0009937844764348707, "loss": 4.002, "step": 693 }, { "epoch": 0.08, "grad_norm": 5.199705922061841, "learning_rate": 0.0009937552536103727, "loss": 4.1991, "step": 694 }, { "epoch": 0.08, "grad_norm": 2.437999262520451, "learning_rate": 0.000993725962681749, "loss": 4.164, "step": 695 }, { "epoch": 0.08, "grad_norm": 1.1850843372591673, "learning_rate": 0.0009936966036530402, "loss": 3.9344, "step": 696 }, { "epoch": 0.08, "grad_norm": 2.132867346494711, "learning_rate": 0.0009936671765282956, "loss": 4.3679, "step": 697 }, { "epoch": 0.08, "grad_norm": 1.7676011710258328, "learning_rate": 0.0009936376813115741, "loss": 3.8544, "step": 698 }, { "epoch": 0.08, "grad_norm": 1.6040325207499742, "learning_rate": 0.000993608118006944, "loss": 4.1579, "step": 699 }, { "epoch": 0.08, "grad_norm": 1.7840659512618573, "learning_rate": 0.0009935784866184833, "loss": 4.1892, "step": 700 }, { "epoch": 0.08, "grad_norm": 1.2547363770340607, "learning_rate": 0.0009935487871502787, "loss": 4.1591, "step": 701 }, { "epoch": 0.08, "grad_norm": 4.148855007413094, "learning_rate": 0.0009935190196064267, "loss": 4.0047, "step": 702 }, { "epoch": 0.08, "grad_norm": 1.4394707201766952, "learning_rate": 0.0009934891839910333, "loss": 4.228, "step": 703 }, { "epoch": 0.08, "grad_norm": 3.238129732348213, "learning_rate": 0.0009934592803082138, "loss": 4.0628, "step": 704 }, { "epoch": 0.08, "grad_norm": 1.8710782938609463, "learning_rate": 0.0009934293085620929, "loss": 4.4517, "step": 705 }, { "epoch": 0.08, "grad_norm": 9.32057514326992, "learning_rate": 0.0009933992687568044, "loss": 3.9622, "step": 706 }, { "epoch": 0.08, "grad_norm": 1.54767638565984, "learning_rate": 0.0009933691608964917, "loss": 4.2051, "step": 707 }, { "epoch": 0.08, "grad_norm": 6.57251529350329, "learning_rate": 0.0009933389849853078, "loss": 4.0941, "step": 708 }, { "epoch": 0.08, "grad_norm": 4.6949209911194485, 
"learning_rate": 0.0009933087410274148, "loss": 4.2058, "step": 709 }, { "epoch": 0.08, "grad_norm": 1.4872210790291271, "learning_rate": 0.0009932784290269843, "loss": 4.3284, "step": 710 }, { "epoch": 0.08, "grad_norm": 7.039358233839792, "learning_rate": 0.0009932480489881974, "loss": 4.1659, "step": 711 }, { "epoch": 0.08, "grad_norm": 1.5309905551698293, "learning_rate": 0.0009932176009152442, "loss": 4.3764, "step": 712 }, { "epoch": 0.08, "grad_norm": 2.3741935009221193, "learning_rate": 0.0009931870848123245, "loss": 4.3177, "step": 713 }, { "epoch": 0.08, "grad_norm": 1.6026087543527325, "learning_rate": 0.0009931565006836476, "loss": 4.1264, "step": 714 }, { "epoch": 0.08, "grad_norm": 2.3784415443469045, "learning_rate": 0.0009931258485334315, "loss": 4.3177, "step": 715 }, { "epoch": 0.08, "grad_norm": 3.521722221514614, "learning_rate": 0.0009930951283659048, "loss": 3.9372, "step": 716 }, { "epoch": 0.08, "grad_norm": 4.150274239706253, "learning_rate": 0.0009930643401853043, "loss": 4.0893, "step": 717 }, { "epoch": 0.08, "grad_norm": 2.0078243843614643, "learning_rate": 0.0009930334839958765, "loss": 4.1692, "step": 718 }, { "epoch": 0.08, "grad_norm": 1.425596482893849, "learning_rate": 0.000993002559801878, "loss": 4.1342, "step": 719 }, { "epoch": 0.08, "grad_norm": 2.1708120851171464, "learning_rate": 0.0009929715676075736, "loss": 4.0451, "step": 720 }, { "epoch": 0.08, "grad_norm": 4.895670305255316, "learning_rate": 0.0009929405074172383, "loss": 4.4976, "step": 721 }, { "epoch": 0.08, "grad_norm": 1.4950444248800692, "learning_rate": 0.0009929093792351567, "loss": 4.0189, "step": 722 }, { "epoch": 0.08, "grad_norm": 2.6988180516334936, "learning_rate": 0.0009928781830656215, "loss": 4.3541, "step": 723 }, { "epoch": 0.08, "grad_norm": 2.6253414798275876, "learning_rate": 0.0009928469189129363, "loss": 4.3916, "step": 724 }, { "epoch": 0.08, "grad_norm": 2.6134098500656573, "learning_rate": 0.0009928155867814131, "loss": 4.193, "step": 725 }, { "epoch": 0.08, "grad_norm": 1.8137965934267906, "learning_rate": 0.0009927841866753735, "loss": 4.2869, "step": 726 }, { "epoch": 0.08, "grad_norm": 2.3430626519886903, "learning_rate": 0.000992752718599149, "loss": 4.4761, "step": 727 }, { "epoch": 0.08, "grad_norm": 2.963148481677119, "learning_rate": 0.0009927211825570793, "loss": 4.1003, "step": 728 }, { "epoch": 0.08, "grad_norm": 1.7125356956327422, "learning_rate": 0.000992689578553515, "loss": 4.0978, "step": 729 }, { "epoch": 0.08, "grad_norm": 4.164090748182801, "learning_rate": 0.0009926579065928144, "loss": 3.9676, "step": 730 }, { "epoch": 0.08, "grad_norm": 1.6672639039175612, "learning_rate": 0.000992626166679347, "loss": 4.2166, "step": 731 }, { "epoch": 0.08, "grad_norm": 3.325034862700568, "learning_rate": 0.0009925943588174897, "loss": 4.1478, "step": 732 }, { "epoch": 0.08, "grad_norm": 2.0908938042937466, "learning_rate": 0.0009925624830116305, "loss": 4.3866, "step": 733 }, { "epoch": 0.08, "grad_norm": 2.6243891789347322, "learning_rate": 0.000992530539266166, "loss": 4.2286, "step": 734 }, { "epoch": 0.08, "grad_norm": 2.493282875693308, "learning_rate": 0.0009924985275855018, "loss": 4.3163, "step": 735 }, { "epoch": 0.08, "grad_norm": 7.048672435349092, "learning_rate": 0.000992466447974054, "loss": 4.0767, "step": 736 }, { "epoch": 0.08, "grad_norm": 4.263664385719958, "learning_rate": 0.0009924343004362466, "loss": 4.0472, "step": 737 }, { "epoch": 0.08, "grad_norm": 2.8502603775428574, "learning_rate": 0.0009924020849765142, "loss": 4.1134, "step": 
738 }, { "epoch": 0.08, "grad_norm": 2.019518653989259, "learning_rate": 0.0009923698015993003, "loss": 3.9955, "step": 739 }, { "epoch": 0.08, "grad_norm": 7.415339956788553, "learning_rate": 0.0009923374503090577, "loss": 4.5334, "step": 740 }, { "epoch": 0.08, "grad_norm": 4.935130365716812, "learning_rate": 0.0009923050311102487, "loss": 4.1967, "step": 741 }, { "epoch": 0.09, "grad_norm": 2.745716345314492, "learning_rate": 0.0009922725440073446, "loss": 4.4008, "step": 742 }, { "epoch": 0.09, "grad_norm": 2.860101068810814, "learning_rate": 0.0009922399890048268, "loss": 3.9512, "step": 743 }, { "epoch": 0.09, "grad_norm": 3.158279857202506, "learning_rate": 0.0009922073661071855, "loss": 4.1149, "step": 744 }, { "epoch": 0.09, "grad_norm": 2.791582018566665, "learning_rate": 0.0009921746753189203, "loss": 4.2252, "step": 745 }, { "epoch": 0.09, "grad_norm": 1.3238490166096701, "learning_rate": 0.0009921419166445404, "loss": 4.1235, "step": 746 }, { "epoch": 0.09, "grad_norm": 2.7171155566558083, "learning_rate": 0.0009921090900885641, "loss": 4.2676, "step": 747 }, { "epoch": 0.09, "grad_norm": 3.1138251099118706, "learning_rate": 0.0009920761956555193, "loss": 4.0925, "step": 748 }, { "epoch": 0.09, "grad_norm": 1.983470727051293, "learning_rate": 0.0009920432333499433, "loss": 4.1856, "step": 749 }, { "epoch": 0.09, "grad_norm": 3.8263031262313496, "learning_rate": 0.0009920102031763822, "loss": 4.2059, "step": 750 }, { "epoch": 0.09, "grad_norm": 2.4803042827859167, "learning_rate": 0.0009919771051393922, "loss": 4.0793, "step": 751 }, { "epoch": 0.09, "grad_norm": 3.1217377314032855, "learning_rate": 0.0009919439392435385, "loss": 4.0334, "step": 752 }, { "epoch": 0.09, "grad_norm": 2.863013461834579, "learning_rate": 0.0009919107054933956, "loss": 4.3082, "step": 753 }, { "epoch": 0.09, "grad_norm": 1.5681152833171614, "learning_rate": 0.0009918774038935477, "loss": 4.3405, "step": 754 }, { "epoch": 0.09, "grad_norm": 1.6475644127078022, "learning_rate": 0.000991844034448588, "loss": 4.0021, "step": 755 }, { "epoch": 0.09, "grad_norm": 1.8984409427187539, "learning_rate": 0.000991810597163119, "loss": 4.0574, "step": 756 }, { "epoch": 0.09, "grad_norm": 1.58939146630792, "learning_rate": 0.000991777092041753, "loss": 3.9337, "step": 757 }, { "epoch": 0.09, "grad_norm": 6.777224982143146, "learning_rate": 0.0009917435190891111, "loss": 4.0616, "step": 758 }, { "epoch": 0.09, "grad_norm": 3.0770765445667645, "learning_rate": 0.0009917098783098243, "loss": 4.1374, "step": 759 }, { "epoch": 0.09, "grad_norm": 1.7425690799391629, "learning_rate": 0.0009916761697085327, "loss": 4.2819, "step": 760 }, { "epoch": 0.09, "grad_norm": 1.5513955926682097, "learning_rate": 0.0009916423932898857, "loss": 4.1649, "step": 761 }, { "epoch": 0.09, "grad_norm": 1.5661020418110607, "learning_rate": 0.0009916085490585423, "loss": 4.2489, "step": 762 }, { "epoch": 0.09, "grad_norm": 1.56959799401122, "learning_rate": 0.0009915746370191701, "loss": 4.1859, "step": 763 }, { "epoch": 0.09, "grad_norm": 1.505322579394529, "learning_rate": 0.0009915406571764471, "loss": 4.173, "step": 764 }, { "epoch": 0.09, "grad_norm": 2.6263996517435877, "learning_rate": 0.0009915066095350603, "loss": 4.3217, "step": 765 }, { "epoch": 0.09, "grad_norm": 1.5956846875850819, "learning_rate": 0.0009914724940997053, "loss": 4.3342, "step": 766 }, { "epoch": 0.09, "grad_norm": 1.2321829796423942, "learning_rate": 0.0009914383108750883, "loss": 3.9887, "step": 767 }, { "epoch": 0.09, "grad_norm": 1.6435208024145433, 
"learning_rate": 0.000991404059865924, "loss": 4.2994, "step": 768 }, { "epoch": 0.09, "grad_norm": 2.59087168033926, "learning_rate": 0.0009913697410769366, "loss": 4.0265, "step": 769 }, { "epoch": 0.09, "grad_norm": 1.6945448430914578, "learning_rate": 0.0009913353545128597, "loss": 4.1718, "step": 770 }, { "epoch": 0.09, "grad_norm": 1.7292578860620342, "learning_rate": 0.0009913009001784364, "loss": 4.3701, "step": 771 }, { "epoch": 0.09, "grad_norm": 1.9766021220287124, "learning_rate": 0.0009912663780784188, "loss": 4.2994, "step": 772 }, { "epoch": 0.09, "grad_norm": 1.8254900755138068, "learning_rate": 0.000991231788217569, "loss": 3.9365, "step": 773 }, { "epoch": 0.09, "grad_norm": 1.586410849650027, "learning_rate": 0.0009911971306006575, "loss": 4.2367, "step": 774 }, { "epoch": 0.09, "grad_norm": 1.9304860298566036, "learning_rate": 0.000991162405232465, "loss": 4.1254, "step": 775 }, { "epoch": 0.09, "grad_norm": 2.690606637687549, "learning_rate": 0.0009911276121177812, "loss": 4.4039, "step": 776 }, { "epoch": 0.09, "grad_norm": 1.859797871038602, "learning_rate": 0.0009910927512614051, "loss": 4.2972, "step": 777 }, { "epoch": 0.09, "grad_norm": 1.4807019178862428, "learning_rate": 0.000991057822668145, "loss": 4.1683, "step": 778 }, { "epoch": 0.09, "grad_norm": 1.410080870844395, "learning_rate": 0.0009910228263428186, "loss": 4.1035, "step": 779 }, { "epoch": 0.09, "grad_norm": 1.4357253253695492, "learning_rate": 0.000990987762290253, "loss": 3.9886, "step": 780 }, { "epoch": 0.09, "grad_norm": 1.4765043354248306, "learning_rate": 0.0009909526305152848, "loss": 4.2546, "step": 781 }, { "epoch": 0.09, "grad_norm": 1.52803621062874, "learning_rate": 0.0009909174310227596, "loss": 3.9914, "step": 782 }, { "epoch": 0.09, "grad_norm": 2.534073155699533, "learning_rate": 0.0009908821638175325, "loss": 4.0564, "step": 783 }, { "epoch": 0.09, "grad_norm": 1.9835273525298232, "learning_rate": 0.000990846828904468, "loss": 4.3435, "step": 784 }, { "epoch": 0.09, "grad_norm": 1.3910984032738951, "learning_rate": 0.0009908114262884397, "loss": 4.3049, "step": 785 }, { "epoch": 0.09, "grad_norm": 2.257033092148284, "learning_rate": 0.0009907759559743311, "loss": 4.0916, "step": 786 }, { "epoch": 0.09, "grad_norm": 1.3968863500146882, "learning_rate": 0.0009907404179670342, "loss": 4.0653, "step": 787 }, { "epoch": 0.09, "grad_norm": 1.438859813251157, "learning_rate": 0.000990704812271451, "loss": 4.2955, "step": 788 }, { "epoch": 0.09, "grad_norm": 2.2667941961354074, "learning_rate": 0.0009906691388924928, "loss": 4.3845, "step": 789 }, { "epoch": 0.09, "grad_norm": 2.0350798052845342, "learning_rate": 0.0009906333978350799, "loss": 3.975, "step": 790 }, { "epoch": 0.09, "grad_norm": 1.5444468262184543, "learning_rate": 0.000990597589104142, "loss": 4.1477, "step": 791 }, { "epoch": 0.09, "grad_norm": 1.9912229961955337, "learning_rate": 0.0009905617127046182, "loss": 3.9488, "step": 792 }, { "epoch": 0.09, "grad_norm": 1.999205567751221, "learning_rate": 0.0009905257686414573, "loss": 4.1048, "step": 793 }, { "epoch": 0.09, "grad_norm": 1.279433840877134, "learning_rate": 0.0009904897569196168, "loss": 4.3206, "step": 794 }, { "epoch": 0.09, "grad_norm": 1.4626698501471354, "learning_rate": 0.0009904536775440641, "loss": 4.0238, "step": 795 }, { "epoch": 0.09, "grad_norm": 1.7311865197098213, "learning_rate": 0.0009904175305197752, "loss": 4.0341, "step": 796 }, { "epoch": 0.09, "grad_norm": 32.09750897099277, "learning_rate": 0.0009903813158517363, "loss": 4.077, "step": 797 
}, { "epoch": 0.09, "grad_norm": 1.74866036245473, "learning_rate": 0.0009903450335449423, "loss": 4.1933, "step": 798 }, { "epoch": 0.09, "grad_norm": 3.465574059405322, "learning_rate": 0.0009903086836043978, "loss": 4.2747, "step": 799 }, { "epoch": 0.09, "grad_norm": 1.422156595810747, "learning_rate": 0.0009902722660351166, "loss": 4.1147, "step": 800 }, { "epoch": 0.09, "grad_norm": 2.333491839666736, "learning_rate": 0.0009902357808421218, "loss": 3.9325, "step": 801 }, { "epoch": 0.09, "grad_norm": 1.6375699702168582, "learning_rate": 0.0009901992280304456, "loss": 4.1576, "step": 802 }, { "epoch": 0.09, "grad_norm": 1.61009157474991, "learning_rate": 0.00099016260760513, "loss": 4.1331, "step": 803 }, { "epoch": 0.09, "grad_norm": 1.5114303702179501, "learning_rate": 0.000990125919571226, "loss": 4.0907, "step": 804 }, { "epoch": 0.09, "grad_norm": 1.620671227027203, "learning_rate": 0.000990089163933794, "loss": 4.1283, "step": 805 }, { "epoch": 0.09, "grad_norm": 1.8673499750745683, "learning_rate": 0.000990052340697904, "loss": 4.2822, "step": 806 }, { "epoch": 0.09, "grad_norm": 1.8239378109730022, "learning_rate": 0.0009900154498686349, "loss": 4.3852, "step": 807 }, { "epoch": 0.09, "grad_norm": 1.5571969481588779, "learning_rate": 0.0009899784914510748, "loss": 3.8481, "step": 808 }, { "epoch": 0.09, "grad_norm": 2.6711497630302974, "learning_rate": 0.0009899414654503216, "loss": 4.2058, "step": 809 }, { "epoch": 0.09, "grad_norm": 2.0175112975235594, "learning_rate": 0.0009899043718714826, "loss": 3.9098, "step": 810 }, { "epoch": 0.09, "grad_norm": 1.7170011301503398, "learning_rate": 0.0009898672107196739, "loss": 4.3873, "step": 811 }, { "epoch": 0.09, "grad_norm": 1.55493969281928, "learning_rate": 0.000989829982000021, "loss": 4.2392, "step": 812 }, { "epoch": 0.09, "grad_norm": 1.4171694730376854, "learning_rate": 0.000989792685717659, "loss": 3.8977, "step": 813 }, { "epoch": 0.09, "grad_norm": 1.4517315092668215, "learning_rate": 0.0009897553218777327, "loss": 4.3902, "step": 814 }, { "epoch": 0.09, "grad_norm": 2.4174543047541768, "learning_rate": 0.000989717890485395, "loss": 4.0579, "step": 815 }, { "epoch": 0.09, "grad_norm": 2.6240165463704432, "learning_rate": 0.0009896803915458094, "loss": 4.133, "step": 816 }, { "epoch": 0.09, "grad_norm": 1.3369108657482633, "learning_rate": 0.0009896428250641479, "loss": 4.0656, "step": 817 }, { "epoch": 0.09, "grad_norm": 1.3670700974097, "learning_rate": 0.000989605191045592, "loss": 4.2307, "step": 818 }, { "epoch": 0.09, "grad_norm": 3.0451673649230195, "learning_rate": 0.0009895674894953327, "loss": 4.0643, "step": 819 }, { "epoch": 0.09, "grad_norm": 1.7242471209559656, "learning_rate": 0.0009895297204185706, "loss": 4.173, "step": 820 }, { "epoch": 0.09, "grad_norm": 1.5073010344004656, "learning_rate": 0.0009894918838205145, "loss": 4.2359, "step": 821 }, { "epoch": 0.09, "grad_norm": 2.0074077994258865, "learning_rate": 0.0009894539797063837, "loss": 4.1601, "step": 822 }, { "epoch": 0.09, "grad_norm": 1.793134430459254, "learning_rate": 0.0009894160080814061, "loss": 4.3247, "step": 823 }, { "epoch": 0.09, "grad_norm": 1.8824130569005808, "learning_rate": 0.0009893779689508194, "loss": 4.2753, "step": 824 }, { "epoch": 0.09, "grad_norm": 1.6563378739412518, "learning_rate": 0.0009893398623198703, "loss": 4.0516, "step": 825 }, { "epoch": 0.09, "grad_norm": 3.7061453289107154, "learning_rate": 0.0009893016881938148, "loss": 4.47, "step": 826 }, { "epoch": 0.09, "grad_norm": 1.5456325839268084, "learning_rate": 
0.0009892634465779185, "loss": 4.1089, "step": 827 }, { "epoch": 0.09, "grad_norm": 1.868895155214188, "learning_rate": 0.000989225137477456, "loss": 4.0752, "step": 828 }, { "epoch": 0.1, "grad_norm": 1.6764141215503516, "learning_rate": 0.000989186760897711, "loss": 4.1721, "step": 829 }, { "epoch": 0.1, "grad_norm": 2.9712489021127935, "learning_rate": 0.0009891483168439773, "loss": 4.1865, "step": 830 }, { "epoch": 0.1, "grad_norm": 1.6663575257336496, "learning_rate": 0.000989109805321557, "loss": 4.1763, "step": 831 }, { "epoch": 0.1, "grad_norm": 1.3128683640496095, "learning_rate": 0.0009890712263357626, "loss": 4.1873, "step": 832 }, { "epoch": 0.1, "grad_norm": 1.2834978417668657, "learning_rate": 0.000989032579891915, "loss": 4.1794, "step": 833 }, { "epoch": 0.1, "grad_norm": 1.4728997577299825, "learning_rate": 0.000988993865995345, "loss": 4.156, "step": 834 }, { "epoch": 0.1, "grad_norm": 1.518306156950804, "learning_rate": 0.000988955084651392, "loss": 4.175, "step": 835 }, { "epoch": 0.1, "grad_norm": 1.4993741327328471, "learning_rate": 0.0009889162358654056, "loss": 4.1886, "step": 836 }, { "epoch": 0.1, "grad_norm": 1.396175609739692, "learning_rate": 0.000988877319642744, "loss": 4.2441, "step": 837 }, { "epoch": 0.1, "grad_norm": 1.4682446559485016, "learning_rate": 0.000988838335988775, "loss": 4.2163, "step": 838 }, { "epoch": 0.1, "grad_norm": 1.471856930949423, "learning_rate": 0.0009887992849088754, "loss": 4.3202, "step": 839 }, { "epoch": 0.1, "grad_norm": 1.3754504931487959, "learning_rate": 0.000988760166408432, "loss": 4.2168, "step": 840 }, { "epoch": 0.1, "grad_norm": 1.9941856228215262, "learning_rate": 0.0009887209804928404, "loss": 4.1255, "step": 841 }, { "epoch": 0.1, "grad_norm": 2.3320208055618443, "learning_rate": 0.0009886817271675052, "loss": 4.0288, "step": 842 }, { "epoch": 0.1, "grad_norm": 1.3721571943334459, "learning_rate": 0.000988642406437841, "loss": 4.216, "step": 843 }, { "epoch": 0.1, "grad_norm": 1.9964588924841478, "learning_rate": 0.0009886030183092712, "loss": 4.2866, "step": 844 }, { "epoch": 0.1, "grad_norm": 1.3649005738945275, "learning_rate": 0.0009885635627872285, "loss": 4.3362, "step": 845 }, { "epoch": 0.1, "grad_norm": 1.1845987974279126, "learning_rate": 0.0009885240398771554, "loss": 4.0246, "step": 846 }, { "epoch": 0.1, "grad_norm": 1.736380363879416, "learning_rate": 0.0009884844495845029, "loss": 3.9036, "step": 847 }, { "epoch": 0.1, "grad_norm": 1.5168369269512492, "learning_rate": 0.000988444791914732, "loss": 3.966, "step": 848 }, { "epoch": 0.1, "grad_norm": 1.4105584256023633, "learning_rate": 0.0009884050668733126, "loss": 3.914, "step": 849 }, { "epoch": 0.1, "grad_norm": 1.4056703151579113, "learning_rate": 0.0009883652744657244, "loss": 3.9651, "step": 850 }, { "epoch": 0.1, "grad_norm": 1.329598060881227, "learning_rate": 0.0009883254146974554, "loss": 4.0448, "step": 851 }, { "epoch": 0.1, "grad_norm": 1.5602090208102601, "learning_rate": 0.0009882854875740037, "loss": 3.9447, "step": 852 }, { "epoch": 0.1, "grad_norm": 1.8650418016882258, "learning_rate": 0.0009882454931008768, "loss": 4.0408, "step": 853 }, { "epoch": 0.1, "grad_norm": 1.281557417737647, "learning_rate": 0.0009882054312835907, "loss": 4.0568, "step": 854 }, { "epoch": 0.1, "grad_norm": 1.5100294455433731, "learning_rate": 0.0009881653021276715, "loss": 4.1181, "step": 855 }, { "epoch": 0.1, "grad_norm": 7.110449039598737, "learning_rate": 0.0009881251056386541, "loss": 4.1974, "step": 856 }, { "epoch": 0.1, "grad_norm": 
1.5537990663497907, "learning_rate": 0.000988084841822083, "loss": 4.0734, "step": 857 }, { "epoch": 0.1, "grad_norm": 1.2763249051380792, "learning_rate": 0.0009880445106835117, "loss": 4.1956, "step": 858 }, { "epoch": 0.1, "grad_norm": 1.6745506051063044, "learning_rate": 0.000988004112228503, "loss": 3.9389, "step": 859 }, { "epoch": 0.1, "grad_norm": 1.1996399382126313, "learning_rate": 0.0009879636464626294, "loss": 4.2508, "step": 860 }, { "epoch": 0.1, "grad_norm": 1.3091183188090632, "learning_rate": 0.0009879231133914721, "loss": 3.927, "step": 861 }, { "epoch": 0.1, "grad_norm": 1.8284985446109874, "learning_rate": 0.000987882513020622, "loss": 4.222, "step": 862 }, { "epoch": 0.1, "grad_norm": 1.3865093986563062, "learning_rate": 0.000987841845355679, "loss": 4.0721, "step": 863 }, { "epoch": 0.1, "grad_norm": 4.998346350395685, "learning_rate": 0.0009878011104022526, "loss": 4.2823, "step": 864 }, { "epoch": 0.1, "grad_norm": 2.6693564066274464, "learning_rate": 0.0009877603081659614, "loss": 4.2432, "step": 865 }, { "epoch": 0.1, "grad_norm": 1.3978977422187508, "learning_rate": 0.0009877194386524334, "loss": 4.0933, "step": 866 }, { "epoch": 0.1, "grad_norm": 1.7512437446522529, "learning_rate": 0.0009876785018673054, "loss": 4.041, "step": 867 }, { "epoch": 0.1, "grad_norm": 1.6903277484709411, "learning_rate": 0.0009876374978162242, "loss": 4.1412, "step": 868 }, { "epoch": 0.1, "grad_norm": 2.5098457306729203, "learning_rate": 0.0009875964265048452, "loss": 4.2508, "step": 869 }, { "epoch": 0.1, "grad_norm": 1.445772392847085, "learning_rate": 0.0009875552879388336, "loss": 4.0041, "step": 870 }, { "epoch": 0.1, "grad_norm": 2.3747390593033653, "learning_rate": 0.000987514082123864, "loss": 4.1408, "step": 871 }, { "epoch": 0.1, "grad_norm": 2.285219958728977, "learning_rate": 0.0009874728090656193, "loss": 4.0732, "step": 872 }, { "epoch": 0.1, "grad_norm": 6.15818899598526, "learning_rate": 0.0009874314687697927, "loss": 4.1792, "step": 873 }, { "epoch": 0.1, "grad_norm": 3.052884273216278, "learning_rate": 0.0009873900612420866, "loss": 4.0166, "step": 874 }, { "epoch": 0.1, "grad_norm": 1.7073777248289508, "learning_rate": 0.0009873485864882116, "loss": 4.2618, "step": 875 }, { "epoch": 0.1, "grad_norm": 2.019571000491364, "learning_rate": 0.000987307044513889, "loss": 4.0095, "step": 876 }, { "epoch": 0.1, "grad_norm": 1.7463988820010106, "learning_rate": 0.0009872654353248486, "loss": 4.1733, "step": 877 }, { "epoch": 0.1, "grad_norm": 1.4357003165261917, "learning_rate": 0.0009872237589268295, "loss": 4.1106, "step": 878 }, { "epoch": 0.1, "grad_norm": 1.8219086703798706, "learning_rate": 0.00098718201532558, "loss": 4.1302, "step": 879 }, { "epoch": 0.1, "grad_norm": 2.0068526247315424, "learning_rate": 0.0009871402045268582, "loss": 4.1941, "step": 880 }, { "epoch": 0.1, "grad_norm": 1.935874623123465, "learning_rate": 0.000987098326536431, "loss": 4.02, "step": 881 }, { "epoch": 0.1, "grad_norm": 2.6172321457425043, "learning_rate": 0.0009870563813600744, "loss": 4.048, "step": 882 }, { "epoch": 0.1, "grad_norm": 1.5168242146067965, "learning_rate": 0.0009870143690035743, "loss": 4.2247, "step": 883 }, { "epoch": 0.1, "grad_norm": 1.361872540458157, "learning_rate": 0.0009869722894727251, "loss": 3.9947, "step": 884 }, { "epoch": 0.1, "grad_norm": 1.5513944597043743, "learning_rate": 0.0009869301427733314, "loss": 4.0396, "step": 885 }, { "epoch": 0.1, "grad_norm": 1.663206677993715, "learning_rate": 0.000986887928911206, "loss": 4.1825, "step": 886 }, { 
"epoch": 0.1, "grad_norm": 1.3962815432979825, "learning_rate": 0.0009868456478921719, "loss": 4.1016, "step": 887 }, { "epoch": 0.1, "grad_norm": 1.3601482196694634, "learning_rate": 0.0009868032997220608, "loss": 4.1785, "step": 888 }, { "epoch": 0.1, "grad_norm": 1.5727159099927652, "learning_rate": 0.0009867608844067136, "loss": 4.0714, "step": 889 }, { "epoch": 0.1, "grad_norm": 3.408619511973609, "learning_rate": 0.000986718401951981, "loss": 4.1265, "step": 890 }, { "epoch": 0.1, "grad_norm": 1.4587998972012652, "learning_rate": 0.0009866758523637228, "loss": 4.2089, "step": 891 }, { "epoch": 0.1, "grad_norm": 1.5900199971880722, "learning_rate": 0.0009866332356478075, "loss": 3.9907, "step": 892 }, { "epoch": 0.1, "grad_norm": 1.9803060771135164, "learning_rate": 0.000986590551810113, "loss": 4.0381, "step": 893 }, { "epoch": 0.1, "grad_norm": 1.717011093368506, "learning_rate": 0.0009865478008565275, "loss": 4.2757, "step": 894 }, { "epoch": 0.1, "grad_norm": 1.4697052083352828, "learning_rate": 0.0009865049827929475, "loss": 4.0874, "step": 895 }, { "epoch": 0.1, "grad_norm": 1.3169170554076146, "learning_rate": 0.0009864620976252785, "loss": 4.4084, "step": 896 }, { "epoch": 0.1, "grad_norm": 3.503223418433799, "learning_rate": 0.000986419145359436, "loss": 4.1809, "step": 897 }, { "epoch": 0.1, "grad_norm": 1.4616149145951478, "learning_rate": 0.0009863761260013443, "loss": 4.1972, "step": 898 }, { "epoch": 0.1, "grad_norm": 1.3389623483611446, "learning_rate": 0.0009863330395569374, "loss": 4.1365, "step": 899 }, { "epoch": 0.1, "grad_norm": 2.1064198975072026, "learning_rate": 0.000986289886032158, "loss": 4.2181, "step": 900 }, { "epoch": 0.1, "grad_norm": 1.672951834308018, "learning_rate": 0.0009862466654329582, "loss": 4.1912, "step": 901 }, { "epoch": 0.1, "grad_norm": 1.1954580106470967, "learning_rate": 0.0009862033777652997, "loss": 4.1867, "step": 902 }, { "epoch": 0.1, "grad_norm": 3.1181370641541637, "learning_rate": 0.000986160023035153, "loss": 4.1336, "step": 903 }, { "epoch": 0.1, "grad_norm": 2.0854762835686764, "learning_rate": 0.0009861166012484982, "loss": 3.9569, "step": 904 }, { "epoch": 0.1, "grad_norm": 1.432306482984174, "learning_rate": 0.0009860731124113247, "loss": 3.8537, "step": 905 }, { "epoch": 0.1, "grad_norm": 1.232249663303332, "learning_rate": 0.0009860295565296306, "loss": 4.1922, "step": 906 }, { "epoch": 0.1, "grad_norm": 1.4810016570917282, "learning_rate": 0.000985985933609424, "loss": 3.9138, "step": 907 }, { "epoch": 0.1, "grad_norm": 1.3465200259895214, "learning_rate": 0.0009859422436567212, "loss": 4.1411, "step": 908 }, { "epoch": 0.1, "grad_norm": 2.035235109713225, "learning_rate": 0.000985898486677549, "loss": 4.0985, "step": 909 }, { "epoch": 0.1, "grad_norm": 1.9070904012198038, "learning_rate": 0.0009858546626779425, "loss": 4.0797, "step": 910 }, { "epoch": 0.1, "grad_norm": 1.2969523877987432, "learning_rate": 0.0009858107716639464, "loss": 4.1869, "step": 911 }, { "epoch": 0.1, "grad_norm": 1.2721800017742473, "learning_rate": 0.000985766813641615, "loss": 4.0144, "step": 912 }, { "epoch": 0.1, "grad_norm": 1.8432474913858778, "learning_rate": 0.0009857227886170112, "loss": 4.0111, "step": 913 }, { "epoch": 0.1, "grad_norm": 1.2335353539205405, "learning_rate": 0.0009856786965962074, "loss": 3.955, "step": 914 }, { "epoch": 0.1, "grad_norm": 2.2188538037851444, "learning_rate": 0.0009856345375852853, "loss": 4.2648, "step": 915 }, { "epoch": 0.11, "grad_norm": 1.1827077423871535, "learning_rate": 0.0009855903115903357, 
"loss": 4.1443, "step": 916 }, { "epoch": 0.11, "grad_norm": 1.5811102669157695, "learning_rate": 0.0009855460186174588, "loss": 4.3053, "step": 917 }, { "epoch": 0.11, "grad_norm": 2.4327913041296605, "learning_rate": 0.000985501658672764, "loss": 3.9176, "step": 918 }, { "epoch": 0.11, "grad_norm": 3.3053416901828485, "learning_rate": 0.0009854572317623698, "loss": 4.0525, "step": 919 }, { "epoch": 0.11, "grad_norm": 2.0369316243757916, "learning_rate": 0.0009854127378924043, "loss": 4.0978, "step": 920 }, { "epoch": 0.11, "grad_norm": 6.730127376430728, "learning_rate": 0.0009853681770690043, "loss": 4.0223, "step": 921 }, { "epoch": 0.11, "grad_norm": 1.4068672288197746, "learning_rate": 0.0009853235492983164, "loss": 3.9241, "step": 922 }, { "epoch": 0.11, "grad_norm": 4.024083319414701, "learning_rate": 0.000985278854586496, "loss": 3.9154, "step": 923 }, { "epoch": 0.11, "grad_norm": 1.6107517912992424, "learning_rate": 0.0009852340929397076, "loss": 3.9688, "step": 924 }, { "epoch": 0.11, "grad_norm": 2.360364929430685, "learning_rate": 0.0009851892643641257, "loss": 3.9677, "step": 925 }, { "epoch": 0.11, "grad_norm": 1.6007067477654784, "learning_rate": 0.000985144368865933, "loss": 4.1016, "step": 926 }, { "epoch": 0.11, "grad_norm": 1.565740773322196, "learning_rate": 0.0009850994064513226, "loss": 4.2352, "step": 927 }, { "epoch": 0.11, "grad_norm": 2.0618787522548327, "learning_rate": 0.000985054377126496, "loss": 4.0093, "step": 928 }, { "epoch": 0.11, "grad_norm": 1.3537029275699117, "learning_rate": 0.0009850092808976639, "loss": 4.068, "step": 929 }, { "epoch": 0.11, "grad_norm": 1.257422537789429, "learning_rate": 0.0009849641177710467, "loss": 4.3545, "step": 930 }, { "epoch": 0.11, "grad_norm": 1.9178559311251824, "learning_rate": 0.0009849188877528736, "loss": 4.2681, "step": 931 }, { "epoch": 0.11, "grad_norm": 2.2443681177826975, "learning_rate": 0.0009848735908493834, "loss": 4.1493, "step": 932 }, { "epoch": 0.11, "grad_norm": 1.504529758282073, "learning_rate": 0.0009848282270668238, "loss": 4.2458, "step": 933 }, { "epoch": 0.11, "grad_norm": 1.818373151242836, "learning_rate": 0.000984782796411452, "loss": 3.9899, "step": 934 }, { "epoch": 0.11, "grad_norm": 1.721184700636074, "learning_rate": 0.0009847372988895343, "loss": 4.0558, "step": 935 }, { "epoch": 0.11, "grad_norm": 1.6643417356756316, "learning_rate": 0.000984691734507346, "loss": 4.0702, "step": 936 }, { "epoch": 0.11, "grad_norm": 1.160294064773197, "learning_rate": 0.0009846461032711723, "loss": 3.7598, "step": 937 }, { "epoch": 0.11, "grad_norm": 1.154715007321242, "learning_rate": 0.0009846004051873066, "loss": 4.0973, "step": 938 }, { "epoch": 0.11, "grad_norm": 1.2667389691399629, "learning_rate": 0.0009845546402620523, "loss": 4.0992, "step": 939 }, { "epoch": 0.11, "grad_norm": 2.7929494846930614, "learning_rate": 0.0009845088085017218, "loss": 4.224, "step": 940 }, { "epoch": 0.11, "grad_norm": 2.190236381170359, "learning_rate": 0.000984462909912637, "loss": 4.2211, "step": 941 }, { "epoch": 0.11, "grad_norm": 2.0972319517618883, "learning_rate": 0.0009844169445011282, "loss": 4.254, "step": 942 }, { "epoch": 0.11, "grad_norm": 1.3481123874607785, "learning_rate": 0.0009843709122735358, "loss": 4.1414, "step": 943 }, { "epoch": 0.11, "grad_norm": 1.2421684906029535, "learning_rate": 0.000984324813236209, "loss": 3.9811, "step": 944 }, { "epoch": 0.11, "grad_norm": 1.2961804574619438, "learning_rate": 0.0009842786473955062, "loss": 4.1073, "step": 945 }, { "epoch": 0.11, "grad_norm": 
1.3525605128526232, "learning_rate": 0.0009842324147577954, "loss": 3.8978, "step": 946 }, { "epoch": 0.11, "grad_norm": 1.4444857280361005, "learning_rate": 0.0009841861153294534, "loss": 4.1966, "step": 947 }, { "epoch": 0.11, "grad_norm": 1.1789541988072991, "learning_rate": 0.000984139749116866, "loss": 4.1291, "step": 948 }, { "epoch": 0.11, "grad_norm": 1.1234830955145556, "learning_rate": 0.0009840933161264288, "loss": 4.2481, "step": 949 }, { "epoch": 0.11, "grad_norm": 2.181979466748466, "learning_rate": 0.0009840468163645462, "loss": 4.0891, "step": 950 }, { "epoch": 0.11, "grad_norm": 1.3130630616785433, "learning_rate": 0.0009840002498376322, "loss": 4.5199, "step": 951 }, { "epoch": 0.11, "grad_norm": 1.2792039211259303, "learning_rate": 0.0009839536165521094, "loss": 3.9934, "step": 952 }, { "epoch": 0.11, "grad_norm": 2.329169065283969, "learning_rate": 0.0009839069165144103, "loss": 4.1398, "step": 953 }, { "epoch": 0.11, "grad_norm": 1.3938295842440709, "learning_rate": 0.0009838601497309763, "loss": 3.9428, "step": 954 }, { "epoch": 0.11, "grad_norm": 1.288277953998746, "learning_rate": 0.0009838133162082578, "loss": 4.027, "step": 955 }, { "epoch": 0.11, "grad_norm": 1.9688762247808194, "learning_rate": 0.0009837664159527146, "loss": 4.0722, "step": 956 }, { "epoch": 0.11, "grad_norm": 1.8943500820923698, "learning_rate": 0.0009837194489708157, "loss": 4.232, "step": 957 }, { "epoch": 0.11, "grad_norm": 1.2687299724279713, "learning_rate": 0.0009836724152690395, "loss": 4.1829, "step": 958 }, { "epoch": 0.11, "grad_norm": 2.035159445592103, "learning_rate": 0.0009836253148538731, "loss": 4.1253, "step": 959 }, { "epoch": 0.11, "grad_norm": 1.8930080439775003, "learning_rate": 0.0009835781477318133, "loss": 4.0971, "step": 960 }, { "epoch": 0.11, "grad_norm": 1.676864331376414, "learning_rate": 0.000983530913909366, "loss": 4.2488, "step": 961 }, { "epoch": 0.11, "grad_norm": 2.5209529200647087, "learning_rate": 0.0009834836133930458, "loss": 4.0922, "step": 962 }, { "epoch": 0.11, "grad_norm": 3.405998891887921, "learning_rate": 0.0009834362461893773, "loss": 4.1217, "step": 963 }, { "epoch": 0.11, "grad_norm": 1.2998916999103685, "learning_rate": 0.0009833888123048937, "loss": 4.1057, "step": 964 }, { "epoch": 0.11, "grad_norm": 1.227198930117053, "learning_rate": 0.0009833413117461378, "loss": 4.0129, "step": 965 }, { "epoch": 0.11, "grad_norm": 1.844675821022563, "learning_rate": 0.0009832937445196613, "loss": 3.9225, "step": 966 }, { "epoch": 0.11, "grad_norm": 1.2856203256198022, "learning_rate": 0.000983246110632025, "loss": 4.2927, "step": 967 }, { "epoch": 0.11, "grad_norm": 1.3499265315367033, "learning_rate": 0.0009831984100897994, "loss": 4.0845, "step": 968 }, { "epoch": 0.11, "grad_norm": 1.365485153166664, "learning_rate": 0.0009831506428995636, "loss": 4.1339, "step": 969 }, { "epoch": 0.11, "grad_norm": 1.9620781794217081, "learning_rate": 0.0009831028090679064, "loss": 4.0015, "step": 970 }, { "epoch": 0.11, "grad_norm": 1.3870325556509953, "learning_rate": 0.0009830549086014254, "loss": 4.0252, "step": 971 }, { "epoch": 0.11, "grad_norm": 1.3528615473448937, "learning_rate": 0.0009830069415067276, "loss": 3.881, "step": 972 }, { "epoch": 0.11, "grad_norm": 1.209253150809271, "learning_rate": 0.0009829589077904293, "loss": 4.0272, "step": 973 }, { "epoch": 0.11, "grad_norm": 1.760007132236707, "learning_rate": 0.0009829108074591556, "loss": 3.9374, "step": 974 }, { "epoch": 0.11, "grad_norm": 1.3577312593450195, "learning_rate": 0.0009828626405195412, 
"loss": 3.6868, "step": 975 }, { "epoch": 0.11, "grad_norm": 1.3316880336446062, "learning_rate": 0.0009828144069782296, "loss": 4.1142, "step": 976 }, { "epoch": 0.11, "grad_norm": 1.268857393165799, "learning_rate": 0.0009827661068418738, "loss": 4.1492, "step": 977 }, { "epoch": 0.11, "grad_norm": 2.100966058392433, "learning_rate": 0.0009827177401171361, "loss": 4.1124, "step": 978 }, { "epoch": 0.11, "grad_norm": 1.1547824789843888, "learning_rate": 0.0009826693068106876, "loss": 4.0674, "step": 979 }, { "epoch": 0.11, "grad_norm": 1.4071624064383363, "learning_rate": 0.0009826208069292086, "loss": 4.0377, "step": 980 }, { "epoch": 0.11, "grad_norm": 1.2645346725197648, "learning_rate": 0.000982572240479389, "loss": 4.0718, "step": 981 }, { "epoch": 0.11, "grad_norm": 1.386635144860966, "learning_rate": 0.0009825236074679274, "loss": 4.0349, "step": 982 }, { "epoch": 0.11, "grad_norm": 1.6106752413292151, "learning_rate": 0.0009824749079015318, "loss": 4.1634, "step": 983 }, { "epoch": 0.11, "grad_norm": 1.2669495504852215, "learning_rate": 0.0009824261417869197, "loss": 3.8804, "step": 984 }, { "epoch": 0.11, "grad_norm": 2.264864491208418, "learning_rate": 0.000982377309130817, "loss": 4.0948, "step": 985 }, { "epoch": 0.11, "grad_norm": 9.629022646741673, "learning_rate": 0.0009823284099399596, "loss": 4.1391, "step": 986 }, { "epoch": 0.11, "grad_norm": 3.6756748888303976, "learning_rate": 0.000982279444221092, "loss": 4.1657, "step": 987 }, { "epoch": 0.11, "grad_norm": 1.3972053789425949, "learning_rate": 0.0009822304119809682, "loss": 4.0637, "step": 988 }, { "epoch": 0.11, "grad_norm": 1.3785564453596921, "learning_rate": 0.0009821813132263513, "loss": 3.8942, "step": 989 }, { "epoch": 0.11, "grad_norm": 1.5263326790702567, "learning_rate": 0.0009821321479640134, "loss": 4.2074, "step": 990 }, { "epoch": 0.11, "grad_norm": 1.3418981849872254, "learning_rate": 0.0009820829162007357, "loss": 3.9839, "step": 991 }, { "epoch": 0.11, "grad_norm": 1.632200451528027, "learning_rate": 0.0009820336179433091, "loss": 4.2072, "step": 992 }, { "epoch": 0.11, "grad_norm": 1.4976915080384634, "learning_rate": 0.0009819842531985337, "loss": 4.0729, "step": 993 }, { "epoch": 0.11, "grad_norm": 2.5084418932473986, "learning_rate": 0.0009819348219732176, "loss": 4.0746, "step": 994 }, { "epoch": 0.11, "grad_norm": 1.7042560662751853, "learning_rate": 0.0009818853242741796, "loss": 4.0666, "step": 995 }, { "epoch": 0.11, "grad_norm": 1.2228583641132351, "learning_rate": 0.0009818357601082467, "loss": 3.987, "step": 996 }, { "epoch": 0.11, "grad_norm": 1.4302449706105849, "learning_rate": 0.0009817861294822551, "loss": 4.0448, "step": 997 }, { "epoch": 0.11, "grad_norm": 1.2514956009301395, "learning_rate": 0.0009817364324030506, "loss": 4.0062, "step": 998 }, { "epoch": 0.11, "grad_norm": 3.309429543878451, "learning_rate": 0.0009816866688774882, "loss": 4.1273, "step": 999 }, { "epoch": 0.11, "grad_norm": 2.740596173724092, "learning_rate": 0.0009816368389124314, "loss": 4.3242, "step": 1000 }, { "epoch": 0.11, "grad_norm": 1.3994404964673302, "learning_rate": 0.0009815869425147537, "loss": 4.0991, "step": 1001 }, { "epoch": 0.11, "grad_norm": 1.3436150255579873, "learning_rate": 0.0009815369796913373, "loss": 4.2165, "step": 1002 }, { "epoch": 0.12, "grad_norm": 1.68181278173322, "learning_rate": 0.0009814869504490731, "loss": 4.0639, "step": 1003 }, { "epoch": 0.12, "grad_norm": 2.2388807592389925, "learning_rate": 0.0009814368547948623, "loss": 4.1277, "step": 1004 }, { "epoch": 0.12, 
"grad_norm": 1.3561129199390467, "learning_rate": 0.0009813866927356142, "loss": 4.2392, "step": 1005 }, { "epoch": 0.12, "grad_norm": 1.5839929049675414, "learning_rate": 0.000981336464278248, "loss": 4.1211, "step": 1006 }, { "epoch": 0.12, "grad_norm": 1.5935586867902607, "learning_rate": 0.0009812861694296917, "loss": 4.266, "step": 1007 }, { "epoch": 0.12, "grad_norm": 1.6203917772923815, "learning_rate": 0.0009812358081968825, "loss": 3.9377, "step": 1008 }, { "epoch": 0.12, "grad_norm": 1.4254996074901702, "learning_rate": 0.0009811853805867668, "loss": 3.9319, "step": 1009 }, { "epoch": 0.12, "grad_norm": 1.0905368703268417, "learning_rate": 0.0009811348866063, "loss": 4.0978, "step": 1010 }, { "epoch": 0.12, "grad_norm": 1.3401803272898807, "learning_rate": 0.0009810843262624467, "loss": 4.3479, "step": 1011 }, { "epoch": 0.12, "grad_norm": 1.4739184580623041, "learning_rate": 0.000981033699562181, "loss": 4.346, "step": 1012 }, { "epoch": 0.12, "grad_norm": 1.2480651116184605, "learning_rate": 0.0009809830065124858, "loss": 4.2576, "step": 1013 }, { "epoch": 0.12, "grad_norm": 1.0933041686827845, "learning_rate": 0.0009809322471203534, "loss": 4.0361, "step": 1014 }, { "epoch": 0.12, "grad_norm": 1.3307053189659381, "learning_rate": 0.0009808814213927847, "loss": 4.1843, "step": 1015 }, { "epoch": 0.12, "grad_norm": 3.004333932171263, "learning_rate": 0.0009808305293367904, "loss": 4.2086, "step": 1016 }, { "epoch": 0.12, "grad_norm": 1.3972060412551772, "learning_rate": 0.00098077957095939, "loss": 4.1009, "step": 1017 }, { "epoch": 0.12, "grad_norm": 1.559858818080783, "learning_rate": 0.0009807285462676122, "loss": 4.1411, "step": 1018 }, { "epoch": 0.12, "grad_norm": 2.1659783582623784, "learning_rate": 0.0009806774552684953, "loss": 4.1533, "step": 1019 }, { "epoch": 0.12, "grad_norm": 1.6084813147595303, "learning_rate": 0.0009806262979690857, "loss": 3.9327, "step": 1020 }, { "epoch": 0.12, "grad_norm": 1.2312972306972312, "learning_rate": 0.00098057507437644, "loss": 3.8422, "step": 1021 }, { "epoch": 0.12, "grad_norm": 1.6994436041934444, "learning_rate": 0.0009805237844976234, "loss": 4.0638, "step": 1022 }, { "epoch": 0.12, "grad_norm": 1.1389603372333486, "learning_rate": 0.00098047242833971, "loss": 4.1195, "step": 1023 }, { "epoch": 0.12, "grad_norm": 1.9876011734983448, "learning_rate": 0.0009804210059097841, "loss": 4.0816, "step": 1024 }, { "epoch": 0.12, "grad_norm": 1.2724334989002284, "learning_rate": 0.0009803695172149382, "loss": 4.0182, "step": 1025 }, { "epoch": 0.12, "grad_norm": 4.4662360571998265, "learning_rate": 0.0009803179622622738, "loss": 3.9391, "step": 1026 }, { "epoch": 0.12, "grad_norm": 1.5104018685929173, "learning_rate": 0.0009802663410589023, "loss": 4.1734, "step": 1027 }, { "epoch": 0.12, "grad_norm": 1.857500538465123, "learning_rate": 0.0009802146536119437, "loss": 3.9785, "step": 1028 }, { "epoch": 0.12, "grad_norm": 4.742126918184188, "learning_rate": 0.0009801628999285274, "loss": 3.9753, "step": 1029 }, { "epoch": 0.12, "grad_norm": 1.1382199027096016, "learning_rate": 0.000980111080015792, "loss": 4.1677, "step": 1030 }, { "epoch": 0.12, "grad_norm": 1.3450036396282992, "learning_rate": 0.0009800591938808846, "loss": 3.8564, "step": 1031 }, { "epoch": 0.12, "grad_norm": 2.6023085408242457, "learning_rate": 0.0009800072415309623, "loss": 4.2234, "step": 1032 }, { "epoch": 0.12, "grad_norm": 0.6417002694641387, "learning_rate": 0.0009799552229731907, "loss": 3.8426, "step": 1033 }, { "epoch": 0.12, "grad_norm": 1.3417501041092172, 
"learning_rate": 0.0009799031382147448, "loss": 4.1813, "step": 1034 }, { "epoch": 0.12, "grad_norm": 1.35108028736953, "learning_rate": 0.000979850987262809, "loss": 4.0535, "step": 1035 }, { "epoch": 0.12, "grad_norm": 1.3725344050755355, "learning_rate": 0.0009797987701245761, "loss": 3.9319, "step": 1036 }, { "epoch": 0.12, "grad_norm": 1.7480368063176088, "learning_rate": 0.0009797464868072487, "loss": 4.138, "step": 1037 }, { "epoch": 0.12, "grad_norm": 1.4435541337499989, "learning_rate": 0.0009796941373180384, "loss": 4.2762, "step": 1038 }, { "epoch": 0.12, "grad_norm": 1.4561797992272978, "learning_rate": 0.0009796417216641653, "loss": 4.1518, "step": 1039 }, { "epoch": 0.12, "grad_norm": 1.4406051956442751, "learning_rate": 0.00097958923985286, "loss": 4.0205, "step": 1040 }, { "epoch": 0.12, "grad_norm": 1.4427052018557125, "learning_rate": 0.0009795366918913604, "loss": 4.1441, "step": 1041 }, { "epoch": 0.12, "grad_norm": 2.149424863788346, "learning_rate": 0.0009794840777869152, "loss": 4.108, "step": 1042 }, { "epoch": 0.12, "grad_norm": 1.4661568487383416, "learning_rate": 0.0009794313975467813, "loss": 4.2829, "step": 1043 }, { "epoch": 0.12, "grad_norm": 1.8065930134751198, "learning_rate": 0.0009793786511782248, "loss": 4.016, "step": 1044 }, { "epoch": 0.12, "grad_norm": 1.5059879440458428, "learning_rate": 0.000979325838688521, "loss": 4.0767, "step": 1045 }, { "epoch": 0.12, "grad_norm": 1.4964459269831678, "learning_rate": 0.000979272960084955, "loss": 4.1972, "step": 1046 }, { "epoch": 0.12, "grad_norm": 1.295796562643235, "learning_rate": 0.0009792200153748195, "loss": 4.0989, "step": 1047 }, { "epoch": 0.12, "grad_norm": 2.2406268353811734, "learning_rate": 0.0009791670045654177, "loss": 4.125, "step": 1048 }, { "epoch": 0.12, "grad_norm": 1.0524709211378152, "learning_rate": 0.0009791139276640614, "loss": 3.8674, "step": 1049 }, { "epoch": 0.12, "grad_norm": 1.6216480642342395, "learning_rate": 0.0009790607846780718, "loss": 4.0896, "step": 1050 }, { "epoch": 0.12, "grad_norm": 1.853532916363565, "learning_rate": 0.0009790075756147783, "loss": 4.0757, "step": 1051 }, { "epoch": 0.12, "grad_norm": 1.5602734766725403, "learning_rate": 0.0009789543004815207, "loss": 4.0081, "step": 1052 }, { "epoch": 0.12, "grad_norm": 2.0224790166263475, "learning_rate": 0.000978900959285647, "loss": 4.2737, "step": 1053 }, { "epoch": 0.12, "grad_norm": 1.2253468503468135, "learning_rate": 0.0009788475520345146, "loss": 4.323, "step": 1054 }, { "epoch": 0.12, "grad_norm": 1.632490428083883, "learning_rate": 0.0009787940787354902, "loss": 4.0436, "step": 1055 }, { "epoch": 0.12, "grad_norm": 1.7598618455157506, "learning_rate": 0.000978740539395949, "loss": 4.4733, "step": 1056 }, { "epoch": 0.12, "grad_norm": 1.8763717556596151, "learning_rate": 0.0009786869340232761, "loss": 4.3513, "step": 1057 }, { "epoch": 0.12, "grad_norm": 2.043636702403181, "learning_rate": 0.0009786332626248655, "loss": 3.9846, "step": 1058 }, { "epoch": 0.12, "grad_norm": 1.5165179974184917, "learning_rate": 0.0009785795252081199, "loss": 4.1594, "step": 1059 }, { "epoch": 0.12, "grad_norm": 1.7713577665696567, "learning_rate": 0.000978525721780451, "loss": 4.1367, "step": 1060 }, { "epoch": 0.12, "grad_norm": 1.4577143394932066, "learning_rate": 0.0009784718523492804, "loss": 3.9756, "step": 1061 }, { "epoch": 0.12, "grad_norm": 1.3189643399486193, "learning_rate": 0.0009784179169220384, "loss": 4.2711, "step": 1062 }, { "epoch": 0.12, "grad_norm": 1.1688922704435758, "learning_rate": 
0.0009783639155061643, "loss": 3.8217, "step": 1063 }, { "epoch": 0.12, "grad_norm": 1.1901712017552375, "learning_rate": 0.0009783098481091063, "loss": 4.0692, "step": 1064 }, { "epoch": 0.12, "grad_norm": 3.5640723009476853, "learning_rate": 0.0009782557147383225, "loss": 3.9555, "step": 1065 }, { "epoch": 0.12, "grad_norm": 1.337447186323066, "learning_rate": 0.0009782015154012789, "loss": 4.1082, "step": 1066 }, { "epoch": 0.12, "grad_norm": 1.3503852124519937, "learning_rate": 0.0009781472501054517, "loss": 3.9999, "step": 1067 }, { "epoch": 0.12, "grad_norm": 1.7679058493549251, "learning_rate": 0.0009780929188583256, "loss": 4.071, "step": 1068 }, { "epoch": 0.12, "grad_norm": 2.8203269847288563, "learning_rate": 0.000978038521667395, "loss": 4.0376, "step": 1069 }, { "epoch": 0.12, "grad_norm": 1.3906106855878695, "learning_rate": 0.000977984058540162, "loss": 4.0321, "step": 1070 }, { "epoch": 0.12, "grad_norm": 1.112966224770276, "learning_rate": 0.0009779295294841397, "loss": 3.9508, "step": 1071 }, { "epoch": 0.12, "grad_norm": 1.8071022465302546, "learning_rate": 0.0009778749345068487, "loss": 4.0672, "step": 1072 }, { "epoch": 0.12, "grad_norm": 1.1747255349007695, "learning_rate": 0.00097782027361582, "loss": 4.1061, "step": 1073 }, { "epoch": 0.12, "grad_norm": 7.505822481279657, "learning_rate": 0.0009777655468185924, "loss": 4.0025, "step": 1074 }, { "epoch": 0.12, "grad_norm": 19.73738615931978, "learning_rate": 0.0009777107541227147, "loss": 4.1069, "step": 1075 }, { "epoch": 0.12, "grad_norm": 1.4956023099881057, "learning_rate": 0.0009776558955357443, "loss": 4.0999, "step": 1076 }, { "epoch": 0.12, "grad_norm": 7.1883516800643505, "learning_rate": 0.0009776009710652483, "loss": 4.0226, "step": 1077 }, { "epoch": 0.12, "grad_norm": 2.945352618167134, "learning_rate": 0.0009775459807188022, "loss": 3.9979, "step": 1078 }, { "epoch": 0.12, "grad_norm": 6.7232738993148935, "learning_rate": 0.0009774909245039909, "loss": 3.8986, "step": 1079 }, { "epoch": 0.12, "grad_norm": 6.022599707549054, "learning_rate": 0.0009774358024284082, "loss": 4.1055, "step": 1080 }, { "epoch": 0.12, "grad_norm": 1.733954078224517, "learning_rate": 0.0009773806144996575, "loss": 4.0344, "step": 1081 }, { "epoch": 0.12, "grad_norm": 1.7181562410431956, "learning_rate": 0.0009773253607253507, "loss": 4.0297, "step": 1082 }, { "epoch": 0.12, "grad_norm": 1.4837372905504802, "learning_rate": 0.000977270041113109, "loss": 3.9395, "step": 1083 }, { "epoch": 0.12, "grad_norm": 1.2960214372781331, "learning_rate": 0.0009772146556705629, "loss": 4.033, "step": 1084 }, { "epoch": 0.12, "grad_norm": 2.7294587762855853, "learning_rate": 0.0009771592044053512, "loss": 4.1999, "step": 1085 }, { "epoch": 0.12, "grad_norm": 2.4835132535215307, "learning_rate": 0.000977103687325123, "loss": 4.3498, "step": 1086 }, { "epoch": 0.12, "grad_norm": 1.6367496867805915, "learning_rate": 0.0009770481044375356, "loss": 3.7739, "step": 1087 }, { "epoch": 0.12, "grad_norm": 2.505620721378224, "learning_rate": 0.0009769924557502553, "loss": 4.1072, "step": 1088 }, { "epoch": 0.12, "grad_norm": 1.5072089113957203, "learning_rate": 0.0009769367412709585, "loss": 4.1192, "step": 1089 }, { "epoch": 0.12, "grad_norm": 2.237455036694607, "learning_rate": 0.0009768809610073291, "loss": 3.9688, "step": 1090 }, { "epoch": 0.13, "grad_norm": 3.5881074272961926, "learning_rate": 0.0009768251149670614, "loss": 4.0566, "step": 1091 }, { "epoch": 0.13, "grad_norm": 1.5306004445254562, "learning_rate": 0.000976769203157858, "loss": 
4.0926, "step": 1092 }, { "epoch": 0.13, "grad_norm": 2.459108469873042, "learning_rate": 0.0009767132255874315, "loss": 4.1152, "step": 1093 }, { "epoch": 0.13, "grad_norm": 1.529228103854115, "learning_rate": 0.0009766571822635022, "loss": 4.019, "step": 1094 }, { "epoch": 0.13, "grad_norm": 2.908086180827562, "learning_rate": 0.0009766010731938007, "loss": 4.1669, "step": 1095 }, { "epoch": 0.13, "grad_norm": 1.7751665987098173, "learning_rate": 0.0009765448983860658, "loss": 4.0237, "step": 1096 }, { "epoch": 0.13, "grad_norm": 2.780538524386183, "learning_rate": 0.0009764886578480461, "loss": 3.9284, "step": 1097 }, { "epoch": 0.13, "grad_norm": 3.6282705290879527, "learning_rate": 0.0009764323515874986, "loss": 4.2978, "step": 1098 }, { "epoch": 0.13, "grad_norm": 1.229668074018975, "learning_rate": 0.00097637597961219, "loss": 4.2336, "step": 1099 }, { "epoch": 0.13, "grad_norm": 5.90286393309262, "learning_rate": 0.0009763195419298955, "loss": 4.1176, "step": 1100 }, { "epoch": 0.13, "grad_norm": 1.7977846055061615, "learning_rate": 0.0009762630385483997, "loss": 4.0484, "step": 1101 }, { "epoch": 0.13, "grad_norm": 3.266000513231285, "learning_rate": 0.000976206469475496, "loss": 4.0218, "step": 1102 }, { "epoch": 0.13, "grad_norm": 2.1262887959835433, "learning_rate": 0.0009761498347189872, "loss": 4.2279, "step": 1103 }, { "epoch": 0.13, "grad_norm": 1.5292156519741424, "learning_rate": 0.000976093134286685, "loss": 4.1973, "step": 1104 }, { "epoch": 0.13, "grad_norm": 3.8660413142325707, "learning_rate": 0.0009760363681864102, "loss": 4.1356, "step": 1105 }, { "epoch": 0.13, "grad_norm": 2.830671950157151, "learning_rate": 0.0009759795364259923, "loss": 4.0439, "step": 1106 }, { "epoch": 0.13, "grad_norm": 1.406535612633439, "learning_rate": 0.0009759226390132704, "loss": 4.1409, "step": 1107 }, { "epoch": 0.13, "grad_norm": 1.3252716022105868, "learning_rate": 0.0009758656759560923, "loss": 4.027, "step": 1108 }, { "epoch": 0.13, "grad_norm": 1.4035213957490544, "learning_rate": 0.0009758086472623151, "loss": 4.0817, "step": 1109 }, { "epoch": 0.13, "grad_norm": 1.3241142438517688, "learning_rate": 0.0009757515529398047, "loss": 4.0039, "step": 1110 }, { "epoch": 0.13, "grad_norm": 1.6201899973034595, "learning_rate": 0.0009756943929964363, "loss": 4.0669, "step": 1111 }, { "epoch": 0.13, "grad_norm": 4.673830349802685, "learning_rate": 0.0009756371674400939, "loss": 4.1111, "step": 1112 }, { "epoch": 0.13, "grad_norm": 5.694745065675966, "learning_rate": 0.0009755798762786707, "loss": 4.1554, "step": 1113 }, { "epoch": 0.13, "grad_norm": 4.02136855399558, "learning_rate": 0.0009755225195200689, "loss": 3.8379, "step": 1114 }, { "epoch": 0.13, "grad_norm": 2.6983553653858396, "learning_rate": 0.0009754650971722, "loss": 4.1691, "step": 1115 }, { "epoch": 0.13, "grad_norm": 1.6126751253997182, "learning_rate": 0.000975407609242984, "loss": 4.0907, "step": 1116 }, { "epoch": 0.13, "grad_norm": 2.772704485458161, "learning_rate": 0.0009753500557403504, "loss": 4.1163, "step": 1117 }, { "epoch": 0.13, "grad_norm": 5.909645065695009, "learning_rate": 0.0009752924366722376, "loss": 4.0528, "step": 1118 }, { "epoch": 0.13, "grad_norm": 2.582678425506499, "learning_rate": 0.0009752347520465931, "loss": 4.2544, "step": 1119 }, { "epoch": 0.13, "grad_norm": 2.135384226358423, "learning_rate": 0.0009751770018713734, "loss": 4.0716, "step": 1120 }, { "epoch": 0.13, "grad_norm": 1.5509236683454088, "learning_rate": 0.0009751191861545439, "loss": 4.0539, "step": 1121 }, { "epoch": 0.13, 
"grad_norm": 3.8330722338516345, "learning_rate": 0.0009750613049040792, "loss": 4.3145, "step": 1122 }, { "epoch": 0.13, "grad_norm": 1.8705910258716192, "learning_rate": 0.0009750033581279632, "loss": 3.9832, "step": 1123 }, { "epoch": 0.13, "grad_norm": 1.7921789245545439, "learning_rate": 0.0009749453458341882, "loss": 4.3183, "step": 1124 }, { "epoch": 0.13, "grad_norm": 2.138390123831862, "learning_rate": 0.000974887268030756, "loss": 4.1324, "step": 1125 }, { "epoch": 0.13, "grad_norm": 2.941418227669491, "learning_rate": 0.0009748291247256774, "loss": 4.1205, "step": 1126 }, { "epoch": 0.13, "grad_norm": 3.192724086851021, "learning_rate": 0.000974770915926972, "loss": 4.2612, "step": 1127 }, { "epoch": 0.13, "grad_norm": 1.5124085016406768, "learning_rate": 0.0009747126416426688, "loss": 4.0929, "step": 1128 }, { "epoch": 0.13, "grad_norm": 2.2329568430831563, "learning_rate": 0.0009746543018808057, "loss": 3.982, "step": 1129 }, { "epoch": 0.13, "grad_norm": 1.762890064648763, "learning_rate": 0.000974595896649429, "loss": 4.1051, "step": 1130 }, { "epoch": 0.13, "grad_norm": 2.637682885345321, "learning_rate": 0.0009745374259565953, "loss": 3.9922, "step": 1131 }, { "epoch": 0.13, "grad_norm": 25.32453668091776, "learning_rate": 0.0009744788898103691, "loss": 3.864, "step": 1132 }, { "epoch": 0.13, "grad_norm": 2.235267057631708, "learning_rate": 0.0009744202882188245, "loss": 4.0362, "step": 1133 }, { "epoch": 0.13, "grad_norm": 2.5463736500752185, "learning_rate": 0.0009743616211900443, "loss": 4.4682, "step": 1134 }, { "epoch": 0.13, "grad_norm": 3.1696326318696917, "learning_rate": 0.0009743028887321206, "loss": 4.2947, "step": 1135 }, { "epoch": 0.13, "grad_norm": 2.3657842728550156, "learning_rate": 0.0009742440908531545, "loss": 3.9849, "step": 1136 }, { "epoch": 0.13, "grad_norm": 2.596779691656245, "learning_rate": 0.0009741852275612559, "loss": 4.1514, "step": 1137 }, { "epoch": 0.13, "grad_norm": 7.596315845767274, "learning_rate": 0.0009741262988645441, "loss": 3.7167, "step": 1138 }, { "epoch": 0.13, "grad_norm": 1.9126708194639253, "learning_rate": 0.000974067304771147, "loss": 4.3363, "step": 1139 }, { "epoch": 0.13, "grad_norm": 1.6110687836218174, "learning_rate": 0.0009740082452892017, "loss": 4.0712, "step": 1140 }, { "epoch": 0.13, "grad_norm": 1.6733970836922054, "learning_rate": 0.0009739491204268545, "loss": 4.0262, "step": 1141 }, { "epoch": 0.13, "grad_norm": 1.7257675928100378, "learning_rate": 0.0009738899301922602, "loss": 4.0802, "step": 1142 }, { "epoch": 0.13, "grad_norm": 1.1863870995065555, "learning_rate": 0.0009738306745935833, "loss": 3.8252, "step": 1143 }, { "epoch": 0.13, "grad_norm": 1.6957810667079547, "learning_rate": 0.0009737713536389969, "loss": 4.3105, "step": 1144 }, { "epoch": 0.13, "grad_norm": 1.5535650103440908, "learning_rate": 0.0009737119673366832, "loss": 4.1927, "step": 1145 }, { "epoch": 0.13, "grad_norm": 1.4085239977897657, "learning_rate": 0.0009736525156948333, "loss": 4.2331, "step": 1146 }, { "epoch": 0.13, "grad_norm": 1.482729441336617, "learning_rate": 0.0009735929987216476, "loss": 4.0458, "step": 1147 }, { "epoch": 0.13, "grad_norm": 1.42521424378714, "learning_rate": 0.0009735334164253351, "loss": 4.1557, "step": 1148 }, { "epoch": 0.13, "grad_norm": 1.6344966883995171, "learning_rate": 0.0009734737688141142, "loss": 4.0865, "step": 1149 }, { "epoch": 0.13, "grad_norm": 2.063509237275933, "learning_rate": 0.0009734140558962123, "loss": 4.0091, "step": 1150 }, { "epoch": 0.13, "grad_norm": 1.5605578223001404, 
"learning_rate": 0.0009733542776798653, "loss": 4.2015, "step": 1151 }, { "epoch": 0.13, "grad_norm": 1.7007938588954026, "learning_rate": 0.0009732944341733188, "loss": 4.1684, "step": 1152 }, { "epoch": 0.13, "grad_norm": 1.7755759228719439, "learning_rate": 0.0009732345253848267, "loss": 4.1435, "step": 1153 }, { "epoch": 0.13, "grad_norm": 2.056371155857814, "learning_rate": 0.0009731745513226526, "loss": 4.0788, "step": 1154 }, { "epoch": 0.13, "grad_norm": 2.0669681105368536, "learning_rate": 0.0009731145119950686, "loss": 4.1167, "step": 1155 }, { "epoch": 0.13, "grad_norm": 1.184587881729591, "learning_rate": 0.0009730544074103562, "loss": 4.1937, "step": 1156 }, { "epoch": 0.13, "grad_norm": 3.5739663166265543, "learning_rate": 0.0009729942375768055, "loss": 4.2401, "step": 1157 }, { "epoch": 0.13, "grad_norm": 1.6241792759188685, "learning_rate": 0.0009729340025027158, "loss": 4.2351, "step": 1158 }, { "epoch": 0.13, "grad_norm": 1.286344512268727, "learning_rate": 0.0009728737021963954, "loss": 4.2528, "step": 1159 }, { "epoch": 0.13, "grad_norm": 1.183168989995341, "learning_rate": 0.0009728133366661615, "loss": 3.9532, "step": 1160 }, { "epoch": 0.13, "grad_norm": 1.625774554341397, "learning_rate": 0.0009727529059203406, "loss": 4.0245, "step": 1161 }, { "epoch": 0.13, "grad_norm": 3.5853936988167363, "learning_rate": 0.0009726924099672676, "loss": 3.9436, "step": 1162 }, { "epoch": 0.13, "grad_norm": 2.4655659561991254, "learning_rate": 0.0009726318488152872, "loss": 3.9937, "step": 1163 }, { "epoch": 0.13, "grad_norm": 1.293292068654827, "learning_rate": 0.0009725712224727523, "loss": 4.0593, "step": 1164 }, { "epoch": 0.13, "grad_norm": 2.2580229821823985, "learning_rate": 0.0009725105309480253, "loss": 4.0255, "step": 1165 }, { "epoch": 0.13, "grad_norm": 1.2649146038014933, "learning_rate": 0.0009724497742494776, "loss": 4.0358, "step": 1166 }, { "epoch": 0.13, "grad_norm": 1.502828711963139, "learning_rate": 0.000972388952385489, "loss": 4.08, "step": 1167 }, { "epoch": 0.13, "grad_norm": 1.369117045013625, "learning_rate": 0.000972328065364449, "loss": 3.9623, "step": 1168 }, { "epoch": 0.13, "grad_norm": 1.626634147873937, "learning_rate": 0.0009722671131947559, "loss": 3.8834, "step": 1169 }, { "epoch": 0.13, "grad_norm": 1.729268137338893, "learning_rate": 0.0009722060958848168, "loss": 3.8334, "step": 1170 }, { "epoch": 0.13, "grad_norm": 1.2615683877921549, "learning_rate": 0.0009721450134430478, "loss": 3.9413, "step": 1171 }, { "epoch": 0.13, "grad_norm": 1.3382229111223451, "learning_rate": 0.000972083865877874, "loss": 4.2426, "step": 1172 }, { "epoch": 0.13, "grad_norm": 1.133084515838234, "learning_rate": 0.0009720226531977296, "loss": 4.2461, "step": 1173 }, { "epoch": 0.13, "grad_norm": 2.926007262463924, "learning_rate": 0.0009719613754110578, "loss": 4.2188, "step": 1174 }, { "epoch": 0.13, "grad_norm": 1.7958379054454545, "learning_rate": 0.0009719000325263109, "loss": 3.9876, "step": 1175 }, { "epoch": 0.13, "grad_norm": 1.464505996209407, "learning_rate": 0.0009718386245519495, "loss": 4.1761, "step": 1176 }, { "epoch": 0.13, "grad_norm": 4.789302472363499, "learning_rate": 0.0009717771514964439, "loss": 3.9937, "step": 1177 }, { "epoch": 0.14, "grad_norm": 9.591048357269306, "learning_rate": 0.0009717156133682734, "loss": 4.0223, "step": 1178 }, { "epoch": 0.14, "grad_norm": 1.9693337042685883, "learning_rate": 0.0009716540101759255, "loss": 4.1943, "step": 1179 }, { "epoch": 0.14, "grad_norm": 1.3153838705192533, "learning_rate": 
0.0009715923419278976, "loss": 4.2516, "step": 1180 }, { "epoch": 0.14, "grad_norm": 1.6134539850799534, "learning_rate": 0.0009715306086326954, "loss": 4.0835, "step": 1181 }, { "epoch": 0.14, "grad_norm": 1.1200446709627152, "learning_rate": 0.0009714688102988339, "loss": 3.9325, "step": 1182 }, { "epoch": 0.14, "grad_norm": 1.412429322448943, "learning_rate": 0.000971406946934837, "loss": 4.187, "step": 1183 }, { "epoch": 0.14, "grad_norm": 1.899389336679635, "learning_rate": 0.0009713450185492378, "loss": 4.0047, "step": 1184 }, { "epoch": 0.14, "grad_norm": 1.6292706292778267, "learning_rate": 0.0009712830251505778, "loss": 4.2102, "step": 1185 }, { "epoch": 0.14, "grad_norm": 1.1659452331946953, "learning_rate": 0.0009712209667474079, "loss": 4.1017, "step": 1186 }, { "epoch": 0.14, "grad_norm": 2.4029556727148824, "learning_rate": 0.0009711588433482881, "loss": 4.2159, "step": 1187 }, { "epoch": 0.14, "grad_norm": 1.060880376637737, "learning_rate": 0.0009710966549617868, "loss": 4.0722, "step": 1188 }, { "epoch": 0.14, "grad_norm": 2.0122289566218052, "learning_rate": 0.0009710344015964819, "loss": 4.052, "step": 1189 }, { "epoch": 0.14, "grad_norm": 1.202898192885117, "learning_rate": 0.00097097208326096, "loss": 4.3066, "step": 1190 }, { "epoch": 0.14, "grad_norm": 0.9788473264941644, "learning_rate": 0.000970909699963817, "loss": 3.9217, "step": 1191 }, { "epoch": 0.14, "grad_norm": 1.5734932933148675, "learning_rate": 0.0009708472517136569, "loss": 4.3952, "step": 1192 }, { "epoch": 0.14, "grad_norm": 1.4736720828568306, "learning_rate": 0.0009707847385190938, "loss": 4.0112, "step": 1193 }, { "epoch": 0.14, "grad_norm": 1.4030659626895816, "learning_rate": 0.00097072216038875, "loss": 3.8787, "step": 1194 }, { "epoch": 0.14, "grad_norm": 1.5350065069787628, "learning_rate": 0.000970659517331257, "loss": 3.8433, "step": 1195 }, { "epoch": 0.14, "grad_norm": 1.368187572385733, "learning_rate": 0.000970596809355255, "loss": 4.0909, "step": 1196 }, { "epoch": 0.14, "grad_norm": 1.0813304860280735, "learning_rate": 0.0009705340364693935, "loss": 3.9371, "step": 1197 }, { "epoch": 0.14, "grad_norm": 1.0430868794277741, "learning_rate": 0.0009704711986823311, "loss": 4.0315, "step": 1198 }, { "epoch": 0.14, "grad_norm": 1.0475288990106282, "learning_rate": 0.0009704082960027348, "loss": 3.9342, "step": 1199 }, { "epoch": 0.14, "grad_norm": 1.4635709128611838, "learning_rate": 0.0009703453284392807, "loss": 4.5649, "step": 1200 }, { "epoch": 0.14, "grad_norm": 2.4791057071695652, "learning_rate": 0.0009702822960006544, "loss": 4.2289, "step": 1201 }, { "epoch": 0.14, "grad_norm": 1.0482846395279493, "learning_rate": 0.0009702191986955494, "loss": 4.1022, "step": 1202 }, { "epoch": 0.14, "grad_norm": 1.7617421925279648, "learning_rate": 0.0009701560365326694, "loss": 3.9233, "step": 1203 }, { "epoch": 0.14, "grad_norm": 1.2034225882681182, "learning_rate": 0.0009700928095207259, "loss": 4.0613, "step": 1204 }, { "epoch": 0.14, "grad_norm": 1.175794881050754, "learning_rate": 0.0009700295176684404, "loss": 3.9424, "step": 1205 }, { "epoch": 0.14, "grad_norm": 1.7086853877400412, "learning_rate": 0.0009699661609845425, "loss": 4.0776, "step": 1206 }, { "epoch": 0.14, "grad_norm": 0.9892153774509086, "learning_rate": 0.0009699027394777708, "loss": 4.0219, "step": 1207 }, { "epoch": 0.14, "grad_norm": 1.708628336613929, "learning_rate": 0.0009698392531568736, "loss": 4.058, "step": 1208 }, { "epoch": 0.14, "grad_norm": 2.2000615687608347, "learning_rate": 0.0009697757020306072, "loss": 
4.1582, "step": 1209 }, { "epoch": 0.14, "grad_norm": 1.5600616955306619, "learning_rate": 0.0009697120861077375, "loss": 4.1324, "step": 1210 }, { "epoch": 0.14, "grad_norm": 1.447371231489673, "learning_rate": 0.0009696484053970391, "loss": 3.9114, "step": 1211 }, { "epoch": 0.14, "grad_norm": 1.2807356539943293, "learning_rate": 0.0009695846599072955, "loss": 4.1062, "step": 1212 }, { "epoch": 0.14, "grad_norm": 1.1237087206715486, "learning_rate": 0.0009695208496472991, "loss": 3.909, "step": 1213 }, { "epoch": 0.14, "grad_norm": 1.0281076429927727, "learning_rate": 0.0009694569746258514, "loss": 4.1287, "step": 1214 }, { "epoch": 0.14, "grad_norm": 1.23230225409678, "learning_rate": 0.0009693930348517628, "loss": 4.0125, "step": 1215 }, { "epoch": 0.14, "grad_norm": 1.1707693038226896, "learning_rate": 0.0009693290303338524, "loss": 4.2016, "step": 1216 }, { "epoch": 0.14, "grad_norm": 1.8220015913333334, "learning_rate": 0.0009692649610809485, "loss": 3.9234, "step": 1217 }, { "epoch": 0.14, "grad_norm": 1.171289904044236, "learning_rate": 0.0009692008271018883, "loss": 4.0113, "step": 1218 }, { "epoch": 0.14, "grad_norm": 0.9322210573290838, "learning_rate": 0.0009691366284055176, "loss": 4.0734, "step": 1219 }, { "epoch": 0.14, "grad_norm": 1.734600010580748, "learning_rate": 0.0009690723650006917, "loss": 4.1134, "step": 1220 }, { "epoch": 0.14, "grad_norm": 1.4034797539200057, "learning_rate": 0.0009690080368962744, "loss": 3.9173, "step": 1221 }, { "epoch": 0.14, "grad_norm": 2.230064675134737, "learning_rate": 0.0009689436441011384, "loss": 3.9892, "step": 1222 }, { "epoch": 0.14, "grad_norm": 1.1759496255090602, "learning_rate": 0.0009688791866241657, "loss": 4.0527, "step": 1223 }, { "epoch": 0.14, "grad_norm": 0.9999327426342115, "learning_rate": 0.0009688146644742468, "loss": 4.0542, "step": 1224 }, { "epoch": 0.14, "grad_norm": 1.2076080774630304, "learning_rate": 0.0009687500776602813, "loss": 4.0501, "step": 1225 }, { "epoch": 0.14, "grad_norm": 1.2589524333294153, "learning_rate": 0.0009686854261911779, "loss": 4.0963, "step": 1226 }, { "epoch": 0.14, "grad_norm": 1.181302557175755, "learning_rate": 0.0009686207100758538, "loss": 4.0867, "step": 1227 }, { "epoch": 0.14, "grad_norm": 1.2853258965571746, "learning_rate": 0.0009685559293232355, "loss": 3.9698, "step": 1228 }, { "epoch": 0.14, "grad_norm": 1.2946259103227646, "learning_rate": 0.0009684910839422582, "loss": 4.065, "step": 1229 }, { "epoch": 0.14, "grad_norm": 1.4001821944181636, "learning_rate": 0.0009684261739418663, "loss": 4.2783, "step": 1230 }, { "epoch": 0.14, "grad_norm": 1.0353500364935149, "learning_rate": 0.0009683611993310127, "loss": 4.0717, "step": 1231 }, { "epoch": 0.14, "grad_norm": 1.1616579681261785, "learning_rate": 0.0009682961601186593, "loss": 3.8174, "step": 1232 }, { "epoch": 0.14, "grad_norm": 1.573493226943176, "learning_rate": 0.000968231056313777, "loss": 4.1308, "step": 1233 }, { "epoch": 0.14, "grad_norm": 0.9867736137347733, "learning_rate": 0.0009681658879253461, "loss": 4.0156, "step": 1234 }, { "epoch": 0.14, "grad_norm": 1.1173672291802164, "learning_rate": 0.0009681006549623548, "loss": 4.2013, "step": 1235 }, { "epoch": 0.14, "grad_norm": 2.274846398276094, "learning_rate": 0.000968035357433801, "loss": 4.0321, "step": 1236 }, { "epoch": 0.14, "grad_norm": 1.088672858079945, "learning_rate": 0.0009679699953486913, "loss": 4.1527, "step": 1237 }, { "epoch": 0.14, "grad_norm": 1.100629397740244, "learning_rate": 0.0009679045687160411, "loss": 4.1384, "step": 1238 }, { 
"epoch": 0.14, "grad_norm": 1.1462748563592338, "learning_rate": 0.0009678390775448745, "loss": 4.2478, "step": 1239 }, { "epoch": 0.14, "grad_norm": 1.0492219449073477, "learning_rate": 0.0009677735218442252, "loss": 3.9664, "step": 1240 }, { "epoch": 0.14, "grad_norm": 1.2427620803189514, "learning_rate": 0.0009677079016231349, "loss": 3.8478, "step": 1241 }, { "epoch": 0.14, "grad_norm": 1.1898124835911121, "learning_rate": 0.000967642216890655, "loss": 4.0103, "step": 1242 }, { "epoch": 0.14, "grad_norm": 1.1925283696271838, "learning_rate": 0.0009675764676558454, "loss": 4.1098, "step": 1243 }, { "epoch": 0.14, "grad_norm": 1.1315149953430668, "learning_rate": 0.000967510653927775, "loss": 3.857, "step": 1244 }, { "epoch": 0.14, "grad_norm": 1.0663978948122352, "learning_rate": 0.0009674447757155213, "loss": 4.1359, "step": 1245 }, { "epoch": 0.14, "grad_norm": 1.2091193298083687, "learning_rate": 0.0009673788330281709, "loss": 4.1407, "step": 1246 }, { "epoch": 0.14, "grad_norm": 0.9668406179669747, "learning_rate": 0.0009673128258748199, "loss": 4.1641, "step": 1247 }, { "epoch": 0.14, "grad_norm": 1.1313203287784486, "learning_rate": 0.0009672467542645722, "loss": 3.8521, "step": 1248 }, { "epoch": 0.14, "grad_norm": 0.9051565325531398, "learning_rate": 0.0009671806182065414, "loss": 3.91, "step": 1249 }, { "epoch": 0.14, "grad_norm": 2.7218377823035156, "learning_rate": 0.0009671144177098494, "loss": 4.0635, "step": 1250 }, { "epoch": 0.14, "grad_norm": 1.5917154358892416, "learning_rate": 0.0009670481527836276, "loss": 4.2512, "step": 1251 }, { "epoch": 0.14, "grad_norm": 2.949335319842319, "learning_rate": 0.000966981823437016, "loss": 3.9789, "step": 1252 }, { "epoch": 0.14, "grad_norm": 1.0595842352377676, "learning_rate": 0.0009669154296791632, "loss": 3.9934, "step": 1253 }, { "epoch": 0.14, "grad_norm": 1.3008971309196644, "learning_rate": 0.000966848971519227, "loss": 4.1803, "step": 1254 }, { "epoch": 0.14, "grad_norm": 0.8975477590740615, "learning_rate": 0.0009667824489663743, "loss": 3.9747, "step": 1255 }, { "epoch": 0.14, "grad_norm": 0.9708306319042626, "learning_rate": 0.0009667158620297803, "loss": 4.1685, "step": 1256 }, { "epoch": 0.14, "grad_norm": 1.1198466671227525, "learning_rate": 0.0009666492107186296, "loss": 3.9932, "step": 1257 }, { "epoch": 0.14, "grad_norm": 1.0031506625939528, "learning_rate": 0.0009665824950421155, "loss": 3.9145, "step": 1258 }, { "epoch": 0.14, "grad_norm": 1.372219557971629, "learning_rate": 0.00096651571500944, "loss": 4.028, "step": 1259 }, { "epoch": 0.14, "grad_norm": 2.7194277439301153, "learning_rate": 0.0009664488706298142, "loss": 3.8859, "step": 1260 }, { "epoch": 0.14, "grad_norm": 0.9792582731179532, "learning_rate": 0.0009663819619124581, "loss": 3.9173, "step": 1261 }, { "epoch": 0.14, "grad_norm": 1.57043400585257, "learning_rate": 0.0009663149888666003, "loss": 4.0343, "step": 1262 }, { "epoch": 0.14, "grad_norm": 1.20535109467719, "learning_rate": 0.0009662479515014786, "loss": 4.0682, "step": 1263 }, { "epoch": 0.14, "grad_norm": 1.2448245441073085, "learning_rate": 0.0009661808498263396, "loss": 3.9224, "step": 1264 }, { "epoch": 0.15, "grad_norm": 1.4597366068102628, "learning_rate": 0.0009661136838504385, "loss": 4.2079, "step": 1265 }, { "epoch": 0.15, "grad_norm": 1.503696882752742, "learning_rate": 0.0009660464535830395, "loss": 4.0442, "step": 1266 }, { "epoch": 0.15, "grad_norm": 1.4384827573708645, "learning_rate": 0.0009659791590334162, "loss": 4.1886, "step": 1267 }, { "epoch": 0.15, "grad_norm": 
2.1675307798981214, "learning_rate": 0.00096591180021085, "loss": 4.0904, "step": 1268 }, { "epoch": 0.15, "grad_norm": 3.0452953514288006, "learning_rate": 0.0009658443771246322, "loss": 4.1226, "step": 1269 }, { "epoch": 0.15, "grad_norm": 1.4586499520173906, "learning_rate": 0.0009657768897840623, "loss": 3.9413, "step": 1270 }, { "epoch": 0.15, "grad_norm": 1.2391239979209134, "learning_rate": 0.000965709338198449, "loss": 4.3865, "step": 1271 }, { "epoch": 0.15, "grad_norm": 1.1342641062896803, "learning_rate": 0.0009656417223771097, "loss": 4.0781, "step": 1272 }, { "epoch": 0.15, "grad_norm": 1.5757764890124977, "learning_rate": 0.0009655740423293708, "loss": 3.9353, "step": 1273 }, { "epoch": 0.15, "grad_norm": 1.2688923644464694, "learning_rate": 0.0009655062980645673, "loss": 4.2811, "step": 1274 }, { "epoch": 0.15, "grad_norm": 0.9849643248901608, "learning_rate": 0.0009654384895920434, "loss": 4.0508, "step": 1275 }, { "epoch": 0.15, "grad_norm": 5.036748133032185, "learning_rate": 0.0009653706169211519, "loss": 3.8548, "step": 1276 }, { "epoch": 0.15, "grad_norm": 1.2571664466171992, "learning_rate": 0.0009653026800612545, "loss": 3.9961, "step": 1277 }, { "epoch": 0.15, "grad_norm": 4.757864758845839, "learning_rate": 0.0009652346790217221, "loss": 3.923, "step": 1278 }, { "epoch": 0.15, "grad_norm": 1.9013147030828397, "learning_rate": 0.0009651666138119337, "loss": 3.9726, "step": 1279 }, { "epoch": 0.15, "grad_norm": 1.323553532372245, "learning_rate": 0.000965098484441278, "loss": 3.9319, "step": 1280 }, { "epoch": 0.15, "grad_norm": 1.1472963295390621, "learning_rate": 0.0009650302909191517, "loss": 3.8899, "step": 1281 }, { "epoch": 0.15, "grad_norm": 1.2261983493948705, "learning_rate": 0.0009649620332549613, "loss": 4.0745, "step": 1282 }, { "epoch": 0.15, "grad_norm": 1.2243231229232525, "learning_rate": 0.0009648937114581212, "loss": 4.0078, "step": 1283 }, { "epoch": 0.15, "grad_norm": 1.3634238929559253, "learning_rate": 0.0009648253255380554, "loss": 4.0532, "step": 1284 }, { "epoch": 0.15, "grad_norm": 1.406255169927441, "learning_rate": 0.0009647568755041963, "loss": 3.911, "step": 1285 }, { "epoch": 0.15, "grad_norm": 1.0512116300036818, "learning_rate": 0.0009646883613659851, "loss": 3.9542, "step": 1286 }, { "epoch": 0.15, "grad_norm": 2.9377415271861103, "learning_rate": 0.0009646197831328725, "loss": 4.2428, "step": 1287 }, { "epoch": 0.15, "grad_norm": 1.457395879640007, "learning_rate": 0.0009645511408143171, "loss": 4.141, "step": 1288 }, { "epoch": 0.15, "grad_norm": 1.365403817307479, "learning_rate": 0.0009644824344197872, "loss": 4.0111, "step": 1289 }, { "epoch": 0.15, "grad_norm": 1.28671080071465, "learning_rate": 0.0009644136639587591, "loss": 4.1803, "step": 1290 }, { "epoch": 0.15, "grad_norm": 1.2155206689118963, "learning_rate": 0.0009643448294407186, "loss": 4.0017, "step": 1291 }, { "epoch": 0.15, "grad_norm": 1.4912997606084466, "learning_rate": 0.0009642759308751601, "loss": 3.9904, "step": 1292 }, { "epoch": 0.15, "grad_norm": 1.3057556663389507, "learning_rate": 0.0009642069682715868, "loss": 4.2431, "step": 1293 }, { "epoch": 0.15, "grad_norm": 1.220533582616741, "learning_rate": 0.0009641379416395109, "loss": 4.1601, "step": 1294 }, { "epoch": 0.15, "grad_norm": 1.0821147234867217, "learning_rate": 0.0009640688509884532, "loss": 4.1358, "step": 1295 }, { "epoch": 0.15, "grad_norm": 1.4502693684135126, "learning_rate": 0.0009639996963279435, "loss": 4.3724, "step": 1296 }, { "epoch": 0.15, "grad_norm": 1.046659706920914, 
"learning_rate": 0.0009639304776675204, "loss": 3.9911, "step": 1297 }, { "epoch": 0.15, "grad_norm": 1.158089983829308, "learning_rate": 0.0009638611950167311, "loss": 4.0686, "step": 1298 }, { "epoch": 0.15, "grad_norm": 1.1266203072465362, "learning_rate": 0.000963791848385132, "loss": 4.1208, "step": 1299 }, { "epoch": 0.15, "grad_norm": 1.186085571151658, "learning_rate": 0.000963722437782288, "loss": 4.0919, "step": 1300 }, { "epoch": 0.15, "grad_norm": 1.1863183337751864, "learning_rate": 0.0009636529632177732, "loss": 4.1347, "step": 1301 }, { "epoch": 0.15, "grad_norm": 1.4903577484460349, "learning_rate": 0.0009635834247011701, "loss": 4.0509, "step": 1302 }, { "epoch": 0.15, "grad_norm": 1.0987996099966162, "learning_rate": 0.0009635138222420703, "loss": 4.2607, "step": 1303 }, { "epoch": 0.15, "grad_norm": 1.129619497091781, "learning_rate": 0.000963444155850074, "loss": 3.934, "step": 1304 }, { "epoch": 0.15, "grad_norm": 1.3876273535259596, "learning_rate": 0.0009633744255347905, "loss": 3.9852, "step": 1305 }, { "epoch": 0.15, "grad_norm": 1.2887624791581556, "learning_rate": 0.0009633046313058378, "loss": 3.8653, "step": 1306 }, { "epoch": 0.15, "grad_norm": 1.3520890907642957, "learning_rate": 0.0009632347731728424, "loss": 3.8879, "step": 1307 }, { "epoch": 0.15, "grad_norm": 0.9645152069946163, "learning_rate": 0.0009631648511454402, "loss": 4.1206, "step": 1308 }, { "epoch": 0.15, "grad_norm": 2.1297168779533933, "learning_rate": 0.0009630948652332756, "loss": 4.0811, "step": 1309 }, { "epoch": 0.15, "grad_norm": 1.0096785648490434, "learning_rate": 0.0009630248154460017, "loss": 3.9295, "step": 1310 }, { "epoch": 0.15, "grad_norm": 1.246496002138543, "learning_rate": 0.0009629547017932805, "loss": 4.0243, "step": 1311 }, { "epoch": 0.15, "grad_norm": 1.1417190865074998, "learning_rate": 0.0009628845242847829, "loss": 3.7737, "step": 1312 }, { "epoch": 0.15, "grad_norm": 1.1269962549491621, "learning_rate": 0.0009628142829301886, "loss": 3.9618, "step": 1313 }, { "epoch": 0.15, "grad_norm": 1.1057197080607994, "learning_rate": 0.000962743977739186, "loss": 3.981, "step": 1314 }, { "epoch": 0.15, "grad_norm": 1.9089449533011953, "learning_rate": 0.0009626736087214724, "loss": 4.12, "step": 1315 }, { "epoch": 0.15, "grad_norm": 1.1271320807417062, "learning_rate": 0.0009626031758867538, "loss": 3.8698, "step": 1316 }, { "epoch": 0.15, "grad_norm": 1.236396468357275, "learning_rate": 0.0009625326792447451, "loss": 4.1488, "step": 1317 }, { "epoch": 0.15, "grad_norm": 1.1414273533367798, "learning_rate": 0.00096246211880517, "loss": 3.941, "step": 1318 }, { "epoch": 0.15, "grad_norm": 1.0671863542233166, "learning_rate": 0.000962391494577761, "loss": 3.8535, "step": 1319 }, { "epoch": 0.15, "grad_norm": 1.1995892805993402, "learning_rate": 0.0009623208065722592, "loss": 4.205, "step": 1320 }, { "epoch": 0.15, "grad_norm": 1.5435692383168593, "learning_rate": 0.0009622500547984147, "loss": 4.0761, "step": 1321 }, { "epoch": 0.15, "grad_norm": 1.005018071788545, "learning_rate": 0.0009621792392659867, "loss": 4.0368, "step": 1322 }, { "epoch": 0.15, "grad_norm": 0.9639813186533578, "learning_rate": 0.0009621083599847424, "loss": 3.9529, "step": 1323 }, { "epoch": 0.15, "grad_norm": 1.019276592025749, "learning_rate": 0.0009620374169644583, "loss": 4.0876, "step": 1324 }, { "epoch": 0.15, "grad_norm": 1.416777761598592, "learning_rate": 0.0009619664102149201, "loss": 4.1054, "step": 1325 }, { "epoch": 0.15, "grad_norm": 1.1675773069379947, "learning_rate": 
0.0009618953397459211, "loss": 3.949, "step": 1326 }, { "epoch": 0.15, "grad_norm": 1.2247095296871755, "learning_rate": 0.0009618242055672648, "loss": 4.0318, "step": 1327 }, { "epoch": 0.15, "grad_norm": 1.0667052073001935, "learning_rate": 0.0009617530076887624, "loss": 4.2134, "step": 1328 }, { "epoch": 0.15, "grad_norm": 1.2185629345881743, "learning_rate": 0.0009616817461202345, "loss": 4.0196, "step": 1329 }, { "epoch": 0.15, "grad_norm": 1.7006289358194273, "learning_rate": 0.0009616104208715101, "loss": 3.8926, "step": 1330 }, { "epoch": 0.15, "grad_norm": 1.355717969206976, "learning_rate": 0.0009615390319524272, "loss": 4.1046, "step": 1331 }, { "epoch": 0.15, "grad_norm": 1.2019400150865323, "learning_rate": 0.0009614675793728327, "loss": 4.1714, "step": 1332 }, { "epoch": 0.15, "grad_norm": 2.4101437888345263, "learning_rate": 0.0009613960631425818, "loss": 3.8381, "step": 1333 }, { "epoch": 0.15, "grad_norm": 1.7769684422635261, "learning_rate": 0.000961324483271539, "loss": 3.9663, "step": 1334 }, { "epoch": 0.15, "grad_norm": 2.578736987366708, "learning_rate": 0.0009612528397695777, "loss": 3.9711, "step": 1335 }, { "epoch": 0.15, "grad_norm": 1.089750772198917, "learning_rate": 0.0009611811326465791, "loss": 4.0127, "step": 1336 }, { "epoch": 0.15, "grad_norm": 1.1556008190663898, "learning_rate": 0.0009611093619124344, "loss": 4.0346, "step": 1337 }, { "epoch": 0.15, "grad_norm": 1.024654198515152, "learning_rate": 0.0009610375275770427, "loss": 4.0658, "step": 1338 }, { "epoch": 0.15, "grad_norm": 1.4349697404589628, "learning_rate": 0.0009609656296503121, "loss": 3.978, "step": 1339 }, { "epoch": 0.15, "grad_norm": 1.2924864000307916, "learning_rate": 0.0009608936681421599, "loss": 4.08, "step": 1340 }, { "epoch": 0.15, "grad_norm": 1.196663858869992, "learning_rate": 0.0009608216430625114, "loss": 4.2919, "step": 1341 }, { "epoch": 0.15, "grad_norm": 4.578906048448203, "learning_rate": 0.0009607495544213014, "loss": 4.0169, "step": 1342 }, { "epoch": 0.15, "grad_norm": 2.134378074679334, "learning_rate": 0.000960677402228473, "loss": 4.074, "step": 1343 }, { "epoch": 0.15, "grad_norm": 1.5671325740533637, "learning_rate": 0.0009606051864939785, "loss": 3.8777, "step": 1344 }, { "epoch": 0.15, "grad_norm": 2.076004351644721, "learning_rate": 0.0009605329072277782, "loss": 3.9445, "step": 1345 }, { "epoch": 0.15, "grad_norm": 1.256574291636711, "learning_rate": 0.000960460564439842, "loss": 4.0719, "step": 1346 }, { "epoch": 0.15, "grad_norm": 1.52749799012796, "learning_rate": 0.0009603881581401482, "loss": 4.1018, "step": 1347 }, { "epoch": 0.15, "grad_norm": 0.9889018557033451, "learning_rate": 0.0009603156883386836, "loss": 4.1006, "step": 1348 }, { "epoch": 0.15, "grad_norm": 1.0692448073354026, "learning_rate": 0.0009602431550454442, "loss": 3.908, "step": 1349 }, { "epoch": 0.15, "grad_norm": 1.1622240185699446, "learning_rate": 0.0009601705582704348, "loss": 4.0077, "step": 1350 }, { "epoch": 0.15, "grad_norm": 1.3408961201610161, "learning_rate": 0.0009600978980236683, "loss": 4.0863, "step": 1351 }, { "epoch": 0.16, "grad_norm": 1.0616569579349835, "learning_rate": 0.0009600251743151672, "loss": 4.1235, "step": 1352 }, { "epoch": 0.16, "grad_norm": 1.0173414309734332, "learning_rate": 0.0009599523871549621, "loss": 3.9606, "step": 1353 }, { "epoch": 0.16, "grad_norm": 1.214905753731753, "learning_rate": 0.0009598795365530928, "loss": 3.9464, "step": 1354 }, { "epoch": 0.16, "grad_norm": 1.365260621568338, "learning_rate": 0.0009598066225196074, "loss": 4.0471, 
"step": 1355 }, { "epoch": 0.16, "grad_norm": 3.3994187419514663, "learning_rate": 0.0009597336450645633, "loss": 3.8945, "step": 1356 }, { "epoch": 0.16, "grad_norm": 1.128312245943589, "learning_rate": 0.000959660604198026, "loss": 4.0903, "step": 1357 }, { "epoch": 0.16, "grad_norm": 0.906013057107507, "learning_rate": 0.0009595874999300703, "loss": 3.9997, "step": 1358 }, { "epoch": 0.16, "grad_norm": 1.1645901496534543, "learning_rate": 0.0009595143322707795, "loss": 4.0854, "step": 1359 }, { "epoch": 0.16, "grad_norm": 1.6613647914770053, "learning_rate": 0.0009594411012302459, "loss": 3.7669, "step": 1360 }, { "epoch": 0.16, "grad_norm": 1.1865425736036217, "learning_rate": 0.0009593678068185701, "loss": 3.8812, "step": 1361 }, { "epoch": 0.16, "grad_norm": 1.0194530851502885, "learning_rate": 0.0009592944490458614, "loss": 4.0117, "step": 1362 }, { "epoch": 0.16, "grad_norm": 1.3784907019724726, "learning_rate": 0.0009592210279222386, "loss": 4.3796, "step": 1363 }, { "epoch": 0.16, "grad_norm": 1.387248568424601, "learning_rate": 0.0009591475434578286, "loss": 4.2694, "step": 1364 }, { "epoch": 0.16, "grad_norm": 1.0368403026689281, "learning_rate": 0.0009590739956627671, "loss": 3.9145, "step": 1365 }, { "epoch": 0.16, "grad_norm": 1.1792360673887636, "learning_rate": 0.0009590003845471987, "loss": 3.8593, "step": 1366 }, { "epoch": 0.16, "grad_norm": 1.1768648584026646, "learning_rate": 0.0009589267101212764, "loss": 3.7872, "step": 1367 }, { "epoch": 0.16, "grad_norm": 1.1559944603113808, "learning_rate": 0.0009588529723951625, "loss": 4.0693, "step": 1368 }, { "epoch": 0.16, "grad_norm": 2.8727158498594787, "learning_rate": 0.0009587791713790276, "loss": 4.0476, "step": 1369 }, { "epoch": 0.16, "grad_norm": 1.2263441938987976, "learning_rate": 0.0009587053070830512, "loss": 3.9883, "step": 1370 }, { "epoch": 0.16, "grad_norm": 1.207322717751473, "learning_rate": 0.0009586313795174213, "loss": 4.0071, "step": 1371 }, { "epoch": 0.16, "grad_norm": 1.297747437812692, "learning_rate": 0.0009585573886923349, "loss": 4.2083, "step": 1372 }, { "epoch": 0.16, "grad_norm": 1.090761419045336, "learning_rate": 0.0009584833346179977, "loss": 4.0697, "step": 1373 }, { "epoch": 0.16, "grad_norm": 1.1261615847874127, "learning_rate": 0.000958409217304624, "loss": 3.9385, "step": 1374 }, { "epoch": 0.16, "grad_norm": 1.0849362559286273, "learning_rate": 0.0009583350367624366, "loss": 3.9657, "step": 1375 }, { "epoch": 0.16, "grad_norm": 1.344625564952928, "learning_rate": 0.0009582607930016678, "loss": 4.0787, "step": 1376 }, { "epoch": 0.16, "grad_norm": 1.466095277304083, "learning_rate": 0.0009581864860325577, "loss": 4.0771, "step": 1377 }, { "epoch": 0.16, "grad_norm": 1.6972533508568106, "learning_rate": 0.0009581121158653558, "loss": 4.4571, "step": 1378 }, { "epoch": 0.16, "grad_norm": 1.0151707018999763, "learning_rate": 0.0009580376825103199, "loss": 3.9447, "step": 1379 }, { "epoch": 0.16, "grad_norm": 1.2359889393507282, "learning_rate": 0.0009579631859777167, "loss": 4.1392, "step": 1380 }, { "epoch": 0.16, "grad_norm": 1.390129554781176, "learning_rate": 0.0009578886262778214, "loss": 3.9477, "step": 1381 }, { "epoch": 0.16, "grad_norm": 1.8819248191093079, "learning_rate": 0.0009578140034209185, "loss": 3.9129, "step": 1382 }, { "epoch": 0.16, "grad_norm": 0.9660049665141721, "learning_rate": 0.0009577393174173004, "loss": 4.0252, "step": 1383 }, { "epoch": 0.16, "grad_norm": 1.5236901335154764, "learning_rate": 0.0009576645682772689, "loss": 4.0692, "step": 1384 }, { "epoch": 
0.16, "grad_norm": 1.5774974085648614, "learning_rate": 0.0009575897560111339, "loss": 3.9528, "step": 1385 }, { "epoch": 0.16, "grad_norm": 1.060813847578125, "learning_rate": 0.0009575148806292146, "loss": 3.9163, "step": 1386 }, { "epoch": 0.16, "grad_norm": 1.5029674401254816, "learning_rate": 0.0009574399421418388, "loss": 3.9893, "step": 1387 }, { "epoch": 0.16, "grad_norm": 1.0226250239889845, "learning_rate": 0.0009573649405593422, "loss": 4.0171, "step": 1388 }, { "epoch": 0.16, "grad_norm": 1.3265762341738823, "learning_rate": 0.0009572898758920704, "loss": 3.6776, "step": 1389 }, { "epoch": 0.16, "grad_norm": 1.411756074290287, "learning_rate": 0.000957214748150377, "loss": 3.9708, "step": 1390 }, { "epoch": 0.16, "grad_norm": 1.3177589388017152, "learning_rate": 0.0009571395573446242, "loss": 4.0602, "step": 1391 }, { "epoch": 0.16, "grad_norm": 1.8237903989738922, "learning_rate": 0.0009570643034851835, "loss": 4.0576, "step": 1392 }, { "epoch": 0.16, "grad_norm": 1.0659137076191685, "learning_rate": 0.0009569889865824345, "loss": 4.275, "step": 1393 }, { "epoch": 0.16, "grad_norm": 2.022035073494475, "learning_rate": 0.0009569136066467659, "loss": 4.0776, "step": 1394 }, { "epoch": 0.16, "grad_norm": 1.314649056506147, "learning_rate": 0.0009568381636885747, "loss": 4.0085, "step": 1395 }, { "epoch": 0.16, "grad_norm": 1.18337631566822, "learning_rate": 0.0009567626577182671, "loss": 3.9921, "step": 1396 }, { "epoch": 0.16, "grad_norm": 1.2069287051734499, "learning_rate": 0.0009566870887462573, "loss": 3.9837, "step": 1397 }, { "epoch": 0.16, "grad_norm": 1.036276195694108, "learning_rate": 0.0009566114567829691, "loss": 3.7358, "step": 1398 }, { "epoch": 0.16, "grad_norm": 1.0262570940167501, "learning_rate": 0.0009565357618388342, "loss": 3.9536, "step": 1399 }, { "epoch": 0.16, "grad_norm": 1.2565237168968066, "learning_rate": 0.0009564600039242932, "loss": 3.9893, "step": 1400 }, { "epoch": 0.16, "grad_norm": 1.0171457456211854, "learning_rate": 0.0009563841830497957, "loss": 4.1678, "step": 1401 }, { "epoch": 0.16, "grad_norm": 1.5129820432649712, "learning_rate": 0.0009563082992257996, "loss": 4.0798, "step": 1402 }, { "epoch": 0.16, "grad_norm": 0.9574336342267478, "learning_rate": 0.0009562323524627716, "loss": 4.0426, "step": 1403 }, { "epoch": 0.16, "grad_norm": 1.1118635576925187, "learning_rate": 0.0009561563427711872, "loss": 4.0956, "step": 1404 }, { "epoch": 0.16, "grad_norm": 1.2142543968891881, "learning_rate": 0.0009560802701615304, "loss": 4.0106, "step": 1405 }, { "epoch": 0.16, "grad_norm": 1.5317853459037922, "learning_rate": 0.0009560041346442941, "loss": 3.8468, "step": 1406 }, { "epoch": 0.16, "grad_norm": 1.1614067748033492, "learning_rate": 0.0009559279362299796, "loss": 3.942, "step": 1407 }, { "epoch": 0.16, "grad_norm": 1.3810695709105454, "learning_rate": 0.0009558516749290971, "loss": 4.0907, "step": 1408 }, { "epoch": 0.16, "grad_norm": 1.3457468263687886, "learning_rate": 0.0009557753507521653, "loss": 3.8531, "step": 1409 }, { "epoch": 0.16, "grad_norm": 1.63141356503915, "learning_rate": 0.0009556989637097118, "loss": 3.9315, "step": 1410 }, { "epoch": 0.16, "grad_norm": 1.4784488670108862, "learning_rate": 0.0009556225138122727, "loss": 4.1192, "step": 1411 }, { "epoch": 0.16, "grad_norm": 1.0249702478378506, "learning_rate": 0.0009555460010703927, "loss": 3.9741, "step": 1412 }, { "epoch": 0.16, "grad_norm": 1.4211728014658338, "learning_rate": 0.0009554694254946252, "loss": 3.8414, "step": 1413 }, { "epoch": 0.16, "grad_norm": 
0.9866506955889526, "learning_rate": 0.0009553927870955327, "loss": 3.8551, "step": 1414 }, { "epoch": 0.16, "grad_norm": 1.7214049010762735, "learning_rate": 0.0009553160858836858, "loss": 3.9022, "step": 1415 }, { "epoch": 0.16, "grad_norm": 1.682332185303543, "learning_rate": 0.000955239321869664, "loss": 3.9477, "step": 1416 }, { "epoch": 0.16, "grad_norm": 1.4127911167889537, "learning_rate": 0.0009551624950640552, "loss": 3.8182, "step": 1417 }, { "epoch": 0.16, "grad_norm": 1.1722705785289373, "learning_rate": 0.0009550856054774566, "loss": 3.9528, "step": 1418 }, { "epoch": 0.16, "grad_norm": 1.800955065290631, "learning_rate": 0.0009550086531204733, "loss": 4.1212, "step": 1419 }, { "epoch": 0.16, "grad_norm": 1.1951928094274147, "learning_rate": 0.0009549316380037196, "loss": 3.97, "step": 1420 }, { "epoch": 0.16, "grad_norm": 3.6389148124870534, "learning_rate": 0.0009548545601378183, "loss": 4.1494, "step": 1421 }, { "epoch": 0.16, "grad_norm": 1.0319715121163524, "learning_rate": 0.0009547774195334007, "loss": 4.0047, "step": 1422 }, { "epoch": 0.16, "grad_norm": 1.0826750676973744, "learning_rate": 0.0009547002162011069, "loss": 4.0859, "step": 1423 }, { "epoch": 0.16, "grad_norm": 8.38933576242107, "learning_rate": 0.0009546229501515856, "loss": 4.0042, "step": 1424 }, { "epoch": 0.16, "grad_norm": 1.9569309342371426, "learning_rate": 0.0009545456213954944, "loss": 4.084, "step": 1425 }, { "epoch": 0.16, "grad_norm": 1.073724926302089, "learning_rate": 0.000954468229943499, "loss": 4.1014, "step": 1426 }, { "epoch": 0.16, "grad_norm": 2.0921806686359132, "learning_rate": 0.0009543907758062742, "loss": 4.1638, "step": 1427 }, { "epoch": 0.16, "grad_norm": 1.0503874156942552, "learning_rate": 0.0009543132589945034, "loss": 3.9889, "step": 1428 }, { "epoch": 0.16, "grad_norm": 1.1240990513091371, "learning_rate": 0.0009542356795188786, "loss": 3.8498, "step": 1429 }, { "epoch": 0.16, "grad_norm": 3.693494680023195, "learning_rate": 0.0009541580373901002, "loss": 3.8564, "step": 1430 }, { "epoch": 0.16, "grad_norm": 2.683483017043802, "learning_rate": 0.0009540803326188777, "loss": 3.8212, "step": 1431 }, { "epoch": 0.16, "grad_norm": 1.3949108404727342, "learning_rate": 0.0009540025652159288, "loss": 4.0194, "step": 1432 }, { "epoch": 0.16, "grad_norm": 1.8551436620265145, "learning_rate": 0.0009539247351919802, "loss": 4.2805, "step": 1433 }, { "epoch": 0.16, "grad_norm": 1.6825292983552596, "learning_rate": 0.0009538468425577669, "loss": 4.0983, "step": 1434 }, { "epoch": 0.16, "grad_norm": 1.3707322205873425, "learning_rate": 0.0009537688873240327, "loss": 4.1492, "step": 1435 }, { "epoch": 0.16, "grad_norm": 1.4782172583497575, "learning_rate": 0.0009536908695015303, "loss": 4.0569, "step": 1436 }, { "epoch": 0.16, "grad_norm": 1.6267272380249376, "learning_rate": 0.0009536127891010205, "loss": 4.1551, "step": 1437 }, { "epoch": 0.16, "grad_norm": 2.633646921841173, "learning_rate": 0.000953534646133273, "loss": 3.94, "step": 1438 }, { "epoch": 0.17, "grad_norm": 1.070799945686044, "learning_rate": 0.0009534564406090664, "loss": 3.7803, "step": 1439 }, { "epoch": 0.17, "grad_norm": 1.0575204774196385, "learning_rate": 0.0009533781725391872, "loss": 4.0849, "step": 1440 }, { "epoch": 0.17, "grad_norm": 1.6377765339149584, "learning_rate": 0.0009532998419344316, "loss": 4.0582, "step": 1441 }, { "epoch": 0.17, "grad_norm": 1.2048656291910675, "learning_rate": 0.0009532214488056032, "loss": 3.9493, "step": 1442 }, { "epoch": 0.17, "grad_norm": 1.2767895806681644, 
"learning_rate": 0.0009531429931635154, "loss": 4.0299, "step": 1443 }, { "epoch": 0.17, "grad_norm": 1.2091484154595902, "learning_rate": 0.0009530644750189892, "loss": 4.2386, "step": 1444 }, { "epoch": 0.17, "grad_norm": 1.146026370776332, "learning_rate": 0.0009529858943828548, "loss": 3.775, "step": 1445 }, { "epoch": 0.17, "grad_norm": 1.117594790290509, "learning_rate": 0.0009529072512659512, "loss": 3.8195, "step": 1446 }, { "epoch": 0.17, "grad_norm": 1.1097512981615767, "learning_rate": 0.0009528285456791253, "loss": 4.1377, "step": 1447 }, { "epoch": 0.17, "grad_norm": 1.2171537299763473, "learning_rate": 0.0009527497776332334, "loss": 4.0768, "step": 1448 }, { "epoch": 0.17, "grad_norm": 1.09787891611974, "learning_rate": 0.0009526709471391397, "loss": 4.0925, "step": 1449 }, { "epoch": 0.17, "grad_norm": 1.3004857317088294, "learning_rate": 0.0009525920542077176, "loss": 4.105, "step": 1450 }, { "epoch": 0.17, "grad_norm": 1.087158981492556, "learning_rate": 0.0009525130988498489, "loss": 3.9095, "step": 1451 }, { "epoch": 0.17, "grad_norm": 1.0988472052864304, "learning_rate": 0.0009524340810764237, "loss": 4.1146, "step": 1452 }, { "epoch": 0.17, "grad_norm": 1.2173233333357378, "learning_rate": 0.0009523550008983413, "loss": 4.0295, "step": 1453 }, { "epoch": 0.17, "grad_norm": 1.2138955239248983, "learning_rate": 0.0009522758583265092, "loss": 3.9534, "step": 1454 }, { "epoch": 0.17, "grad_norm": 1.303078641962124, "learning_rate": 0.0009521966533718436, "loss": 4.0919, "step": 1455 }, { "epoch": 0.17, "grad_norm": 1.2206624908822545, "learning_rate": 0.0009521173860452695, "loss": 3.8942, "step": 1456 }, { "epoch": 0.17, "grad_norm": 1.1002820294500268, "learning_rate": 0.0009520380563577198, "loss": 3.9093, "step": 1457 }, { "epoch": 0.17, "grad_norm": 1.901503834963222, "learning_rate": 0.000951958664320137, "loss": 3.9956, "step": 1458 }, { "epoch": 0.17, "grad_norm": 1.1179821195490596, "learning_rate": 0.0009518792099434717, "loss": 4.159, "step": 1459 }, { "epoch": 0.17, "grad_norm": 2.5118820919125864, "learning_rate": 0.0009517996932386827, "loss": 3.9963, "step": 1460 }, { "epoch": 0.17, "grad_norm": 2.499617765960411, "learning_rate": 0.0009517201142167385, "loss": 4.0515, "step": 1461 }, { "epoch": 0.17, "grad_norm": 1.0389901721025143, "learning_rate": 0.0009516404728886148, "loss": 4.1209, "step": 1462 }, { "epoch": 0.17, "grad_norm": 1.4212175399574847, "learning_rate": 0.000951560769265297, "loss": 3.8276, "step": 1463 }, { "epoch": 0.17, "grad_norm": 1.8441758886908661, "learning_rate": 0.0009514810033577786, "loss": 3.7736, "step": 1464 }, { "epoch": 0.17, "grad_norm": 1.10172755134985, "learning_rate": 0.0009514011751770618, "loss": 4.0846, "step": 1465 }, { "epoch": 0.17, "grad_norm": 1.0085105015042906, "learning_rate": 0.0009513212847341573, "loss": 3.9577, "step": 1466 }, { "epoch": 0.17, "grad_norm": 1.1143872289549588, "learning_rate": 0.0009512413320400847, "loss": 3.8455, "step": 1467 }, { "epoch": 0.17, "grad_norm": 0.9873537784957335, "learning_rate": 0.0009511613171058717, "loss": 3.947, "step": 1468 }, { "epoch": 0.17, "grad_norm": 1.05229355481507, "learning_rate": 0.000951081239942555, "loss": 3.9721, "step": 1469 }, { "epoch": 0.17, "grad_norm": 4.716995287450864, "learning_rate": 0.0009510011005611796, "loss": 3.9344, "step": 1470 }, { "epoch": 0.17, "grad_norm": 5.273042864250928, "learning_rate": 0.0009509208989727992, "loss": 4.076, "step": 1471 }, { "epoch": 0.17, "grad_norm": 2.010145491511432, "learning_rate": 0.000950840635188476, 
"loss": 4.1398, "step": 1472 }, { "epoch": 0.17, "grad_norm": 1.007199636200481, "learning_rate": 0.0009507603092192812, "loss": 3.9969, "step": 1473 }, { "epoch": 0.17, "grad_norm": 1.7894381271237387, "learning_rate": 0.000950679921076294, "loss": 4.2724, "step": 1474 }, { "epoch": 0.17, "grad_norm": 10.429603463402934, "learning_rate": 0.0009505994707706023, "loss": 4.0447, "step": 1475 }, { "epoch": 0.17, "grad_norm": 1.1508940953472522, "learning_rate": 0.000950518958313303, "loss": 4.2006, "step": 1476 }, { "epoch": 0.17, "grad_norm": 1.4096763103907752, "learning_rate": 0.0009504383837155008, "loss": 4.0077, "step": 1477 }, { "epoch": 0.17, "grad_norm": 4.518693490137716, "learning_rate": 0.0009503577469883098, "loss": 3.931, "step": 1478 }, { "epoch": 0.17, "grad_norm": 1.042024985823802, "learning_rate": 0.0009502770481428525, "loss": 4.0088, "step": 1479 }, { "epoch": 0.17, "grad_norm": 2.0405495778312255, "learning_rate": 0.000950196287190259, "loss": 4.0072, "step": 1480 }, { "epoch": 0.17, "grad_norm": 2.7010059022544026, "learning_rate": 0.0009501154641416695, "loss": 4.2255, "step": 1481 }, { "epoch": 0.17, "grad_norm": 2.5761651038558804, "learning_rate": 0.0009500345790082317, "loss": 4.0403, "step": 1482 }, { "epoch": 0.17, "grad_norm": 1.0758688108059402, "learning_rate": 0.0009499536318011019, "loss": 3.9496, "step": 1483 }, { "epoch": 0.17, "grad_norm": 1.5406366019949669, "learning_rate": 0.0009498726225314458, "loss": 4.188, "step": 1484 }, { "epoch": 0.17, "grad_norm": 3.51484401010663, "learning_rate": 0.0009497915512104367, "loss": 4.1759, "step": 1485 }, { "epoch": 0.17, "grad_norm": 1.2954690422363475, "learning_rate": 0.0009497104178492568, "loss": 3.986, "step": 1486 }, { "epoch": 0.17, "grad_norm": 1.1581952459398834, "learning_rate": 0.0009496292224590973, "loss": 3.9818, "step": 1487 }, { "epoch": 0.17, "grad_norm": 1.6418172941756257, "learning_rate": 0.000949547965051157, "loss": 4.1883, "step": 1488 }, { "epoch": 0.17, "grad_norm": 1.1196241061766323, "learning_rate": 0.0009494666456366441, "loss": 3.8681, "step": 1489 }, { "epoch": 0.17, "grad_norm": 1.5419910360689664, "learning_rate": 0.0009493852642267751, "loss": 4.323, "step": 1490 }, { "epoch": 0.17, "grad_norm": 1.4587071478854854, "learning_rate": 0.0009493038208327749, "loss": 3.9006, "step": 1491 }, { "epoch": 0.17, "grad_norm": 1.1875677893518994, "learning_rate": 0.0009492223154658773, "loss": 4.1079, "step": 1492 }, { "epoch": 0.17, "grad_norm": 1.2143493993865064, "learning_rate": 0.0009491407481373241, "loss": 4.0163, "step": 1493 }, { "epoch": 0.17, "grad_norm": 1.0189891447686548, "learning_rate": 0.0009490591188583661, "loss": 4.0038, "step": 1494 }, { "epoch": 0.17, "grad_norm": 1.0115448073831566, "learning_rate": 0.0009489774276402625, "loss": 3.7444, "step": 1495 }, { "epoch": 0.17, "grad_norm": 3.32654957277159, "learning_rate": 0.0009488956744942811, "loss": 3.9377, "step": 1496 }, { "epoch": 0.17, "grad_norm": 1.299342429802708, "learning_rate": 0.0009488138594316982, "loss": 3.8757, "step": 1497 }, { "epoch": 0.17, "grad_norm": 2.4259709599093267, "learning_rate": 0.0009487319824637983, "loss": 4.0633, "step": 1498 }, { "epoch": 0.17, "grad_norm": 1.1590574888539504, "learning_rate": 0.0009486500436018752, "loss": 4.3648, "step": 1499 }, { "epoch": 0.17, "grad_norm": 1.2496521872618813, "learning_rate": 0.0009485680428572308, "loss": 4.0798, "step": 1500 }, { "epoch": 0.17, "grad_norm": 1.781842520581093, "learning_rate": 0.0009484859802411751, "loss": 4.0273, "step": 1501 }, { 
"epoch": 0.17, "grad_norm": 1.301002208899886, "learning_rate": 0.0009484038557650274, "loss": 4.0523, "step": 1502 }, { "epoch": 0.17, "grad_norm": 2.2463540747939885, "learning_rate": 0.0009483216694401152, "loss": 3.8605, "step": 1503 }, { "epoch": 0.17, "grad_norm": 2.0099618971124995, "learning_rate": 0.0009482394212777745, "loss": 3.9779, "step": 1504 }, { "epoch": 0.17, "grad_norm": 2.18745657540292, "learning_rate": 0.0009481571112893498, "loss": 4.3322, "step": 1505 }, { "epoch": 0.17, "grad_norm": 1.5194582325827022, "learning_rate": 0.0009480747394861944, "loss": 3.9104, "step": 1506 }, { "epoch": 0.17, "grad_norm": 1.6263132021317614, "learning_rate": 0.0009479923058796695, "loss": 4.0152, "step": 1507 }, { "epoch": 0.17, "grad_norm": 1.322554396350308, "learning_rate": 0.0009479098104811459, "loss": 4.2256, "step": 1508 }, { "epoch": 0.17, "grad_norm": 1.070602569284558, "learning_rate": 0.0009478272533020016, "loss": 4.0317, "step": 1509 }, { "epoch": 0.17, "grad_norm": 1.8130198873192451, "learning_rate": 0.0009477446343536241, "loss": 4.0511, "step": 1510 }, { "epoch": 0.17, "grad_norm": 1.3264760664406752, "learning_rate": 0.0009476619536474091, "loss": 4.0548, "step": 1511 }, { "epoch": 0.17, "grad_norm": 3.3787143154152606, "learning_rate": 0.0009475792111947607, "loss": 4.1523, "step": 1512 }, { "epoch": 0.17, "grad_norm": 1.3386293055771301, "learning_rate": 0.0009474964070070919, "loss": 4.2017, "step": 1513 }, { "epoch": 0.17, "grad_norm": 2.6121294939588964, "learning_rate": 0.0009474135410958239, "loss": 3.9409, "step": 1514 }, { "epoch": 0.17, "grad_norm": 1.0522877091215437, "learning_rate": 0.0009473306134723862, "loss": 4.0148, "step": 1515 }, { "epoch": 0.17, "grad_norm": 1.1236124263096805, "learning_rate": 0.0009472476241482173, "loss": 3.9441, "step": 1516 }, { "epoch": 0.17, "grad_norm": 1.0395820595564298, "learning_rate": 0.000947164573134764, "loss": 4.125, "step": 1517 }, { "epoch": 0.17, "grad_norm": 1.3952507892784816, "learning_rate": 0.0009470814604434816, "loss": 4.0103, "step": 1518 }, { "epoch": 0.17, "grad_norm": 1.3229866871156335, "learning_rate": 0.000946998286085834, "loss": 3.8555, "step": 1519 }, { "epoch": 0.17, "grad_norm": 1.1261632556758876, "learning_rate": 0.0009469150500732932, "loss": 4.0141, "step": 1520 }, { "epoch": 0.17, "grad_norm": 1.3470085353259682, "learning_rate": 0.0009468317524173402, "loss": 4.3003, "step": 1521 }, { "epoch": 0.17, "grad_norm": 1.2139594995307732, "learning_rate": 0.0009467483931294644, "loss": 4.0957, "step": 1522 }, { "epoch": 0.17, "grad_norm": 1.3417697007210192, "learning_rate": 0.0009466649722211635, "loss": 4.0629, "step": 1523 }, { "epoch": 0.17, "grad_norm": 1.3646457482618009, "learning_rate": 0.000946581489703944, "loss": 4.0718, "step": 1524 }, { "epoch": 0.17, "grad_norm": 1.2202136376915413, "learning_rate": 0.0009464979455893205, "loss": 4.05, "step": 1525 }, { "epoch": 0.17, "grad_norm": 1.0969669820645596, "learning_rate": 0.0009464143398888166, "loss": 4.0512, "step": 1526 }, { "epoch": 0.18, "grad_norm": 1.2589843752132026, "learning_rate": 0.0009463306726139638, "loss": 4.0144, "step": 1527 }, { "epoch": 0.18, "grad_norm": 1.1233362765571688, "learning_rate": 0.0009462469437763026, "loss": 4.0589, "step": 1528 }, { "epoch": 0.18, "grad_norm": 1.4580203295134564, "learning_rate": 0.0009461631533873818, "loss": 4.1124, "step": 1529 }, { "epoch": 0.18, "grad_norm": 1.2194636616978891, "learning_rate": 0.0009460793014587585, "loss": 4.0288, "step": 1530 }, { "epoch": 0.18, "grad_norm": 
1.038005454730143, "learning_rate": 0.0009459953880019987, "loss": 4.2691, "step": 1531 }, { "epoch": 0.18, "grad_norm": 1.772115518169477, "learning_rate": 0.0009459114130286766, "loss": 3.9624, "step": 1532 }, { "epoch": 0.18, "grad_norm": 1.2680884978465021, "learning_rate": 0.0009458273765503749, "loss": 3.9254, "step": 1533 }, { "epoch": 0.18, "grad_norm": 1.2746864359018644, "learning_rate": 0.0009457432785786848, "loss": 4.077, "step": 1534 }, { "epoch": 0.18, "grad_norm": 1.3786959998338246, "learning_rate": 0.0009456591191252061, "loss": 3.9734, "step": 1535 }, { "epoch": 0.18, "grad_norm": 1.1534231512005737, "learning_rate": 0.0009455748982015468, "loss": 4.2007, "step": 1536 }, { "epoch": 0.18, "grad_norm": 1.3134784830948674, "learning_rate": 0.0009454906158193239, "loss": 4.0669, "step": 1537 }, { "epoch": 0.18, "grad_norm": 1.358311015308165, "learning_rate": 0.0009454062719901624, "loss": 3.8458, "step": 1538 }, { "epoch": 0.18, "grad_norm": 1.000640124641988, "learning_rate": 0.0009453218667256958, "loss": 4.1056, "step": 1539 }, { "epoch": 0.18, "grad_norm": 1.564345431568382, "learning_rate": 0.0009452374000375664, "loss": 3.8133, "step": 1540 }, { "epoch": 0.18, "grad_norm": 1.2102795659137715, "learning_rate": 0.0009451528719374245, "loss": 3.9695, "step": 1541 }, { "epoch": 0.18, "grad_norm": 1.1673702199370906, "learning_rate": 0.0009450682824369294, "loss": 4.2753, "step": 1542 }, { "epoch": 0.18, "grad_norm": 1.0946209557587485, "learning_rate": 0.0009449836315477485, "loss": 4.1374, "step": 1543 }, { "epoch": 0.18, "grad_norm": 1.9037035990407962, "learning_rate": 0.0009448989192815578, "loss": 3.9875, "step": 1544 }, { "epoch": 0.18, "grad_norm": 1.083381287749781, "learning_rate": 0.0009448141456500416, "loss": 3.8124, "step": 1545 }, { "epoch": 0.18, "grad_norm": 1.1162191036520939, "learning_rate": 0.0009447293106648931, "loss": 3.9731, "step": 1546 }, { "epoch": 0.18, "grad_norm": 1.139527199053215, "learning_rate": 0.0009446444143378134, "loss": 3.9062, "step": 1547 }, { "epoch": 0.18, "grad_norm": 1.6667585096505606, "learning_rate": 0.0009445594566805126, "loss": 4.0454, "step": 1548 }, { "epoch": 0.18, "grad_norm": 1.360271243123121, "learning_rate": 0.0009444744377047088, "loss": 3.9364, "step": 1549 }, { "epoch": 0.18, "grad_norm": 4.05461583445939, "learning_rate": 0.0009443893574221286, "loss": 4.0601, "step": 1550 }, { "epoch": 0.18, "grad_norm": 0.9990522960201313, "learning_rate": 0.0009443042158445074, "loss": 3.9954, "step": 1551 }, { "epoch": 0.18, "grad_norm": 1.7606335555696202, "learning_rate": 0.000944219012983589, "loss": 3.9506, "step": 1552 }, { "epoch": 0.18, "grad_norm": 1.0150806300182105, "learning_rate": 0.0009441337488511252, "loss": 4.0746, "step": 1553 }, { "epoch": 0.18, "grad_norm": 1.8671079046328658, "learning_rate": 0.0009440484234588766, "loss": 4.1335, "step": 1554 }, { "epoch": 0.18, "grad_norm": 1.1387331699205696, "learning_rate": 0.0009439630368186125, "loss": 3.9632, "step": 1555 }, { "epoch": 0.18, "grad_norm": 1.0802840881191922, "learning_rate": 0.0009438775889421102, "loss": 4.0763, "step": 1556 }, { "epoch": 0.18, "grad_norm": 1.0477878889048695, "learning_rate": 0.0009437920798411554, "loss": 4.2117, "step": 1557 }, { "epoch": 0.18, "grad_norm": 1.020360844478139, "learning_rate": 0.0009437065095275429, "loss": 4.0058, "step": 1558 }, { "epoch": 0.18, "grad_norm": 1.271539788773528, "learning_rate": 0.0009436208780130751, "loss": 3.9709, "step": 1559 }, { "epoch": 0.18, "grad_norm": 1.2325793476806803, 
"learning_rate": 0.0009435351853095633, "loss": 4.2614, "step": 1560 }, { "epoch": 0.18, "grad_norm": 1.5129074206691389, "learning_rate": 0.0009434494314288273, "loss": 3.9118, "step": 1561 }, { "epoch": 0.18, "grad_norm": 1.0852267725667444, "learning_rate": 0.0009433636163826951, "loss": 3.9656, "step": 1562 }, { "epoch": 0.18, "grad_norm": 2.31211843316515, "learning_rate": 0.0009432777401830033, "loss": 4.0632, "step": 1563 }, { "epoch": 0.18, "grad_norm": 1.2201066335760025, "learning_rate": 0.0009431918028415969, "loss": 4.1012, "step": 1564 }, { "epoch": 0.18, "grad_norm": 1.018070076924353, "learning_rate": 0.0009431058043703293, "loss": 3.7451, "step": 1565 }, { "epoch": 0.18, "grad_norm": 1.9877504433243554, "learning_rate": 0.0009430197447810625, "loss": 4.0557, "step": 1566 }, { "epoch": 0.18, "grad_norm": 1.796630409666157, "learning_rate": 0.0009429336240856662, "loss": 4.1918, "step": 1567 }, { "epoch": 0.18, "grad_norm": 0.9549503242484726, "learning_rate": 0.00094284744229602, "loss": 4.0311, "step": 1568 }, { "epoch": 0.18, "grad_norm": 1.4207559380371706, "learning_rate": 0.0009427611994240104, "loss": 4.0464, "step": 1569 }, { "epoch": 0.18, "grad_norm": 1.4423883401927506, "learning_rate": 0.0009426748954815332, "loss": 4.0795, "step": 1570 }, { "epoch": 0.18, "grad_norm": 1.2670034974945652, "learning_rate": 0.0009425885304804922, "loss": 3.755, "step": 1571 }, { "epoch": 0.18, "grad_norm": 1.0930648474783344, "learning_rate": 0.0009425021044328, "loss": 4.0235, "step": 1572 }, { "epoch": 0.18, "grad_norm": 1.4648091635087739, "learning_rate": 0.0009424156173503772, "loss": 4.2862, "step": 1573 }, { "epoch": 0.18, "grad_norm": 1.3294508992598661, "learning_rate": 0.0009423290692451534, "loss": 4.334, "step": 1574 }, { "epoch": 0.18, "grad_norm": 3.5529987163556482, "learning_rate": 0.000942242460129066, "loss": 4.1119, "step": 1575 }, { "epoch": 0.18, "grad_norm": 28.342814929865067, "learning_rate": 0.0009421557900140612, "loss": 4.2338, "step": 1576 }, { "epoch": 0.18, "grad_norm": 1.401753450309854, "learning_rate": 0.0009420690589120932, "loss": 3.988, "step": 1577 }, { "epoch": 0.18, "grad_norm": 1.3152754829611297, "learning_rate": 0.0009419822668351255, "loss": 3.6421, "step": 1578 }, { "epoch": 0.18, "grad_norm": 2.3821217334720153, "learning_rate": 0.0009418954137951288, "loss": 3.9052, "step": 1579 }, { "epoch": 0.18, "grad_norm": 1.4287936380073032, "learning_rate": 0.000941808499804083, "loss": 4.026, "step": 1580 }, { "epoch": 0.18, "grad_norm": 1.589741411118457, "learning_rate": 0.0009417215248739764, "loss": 4.0988, "step": 1581 }, { "epoch": 0.18, "grad_norm": 3.881527201007527, "learning_rate": 0.0009416344890168054, "loss": 3.9948, "step": 1582 }, { "epoch": 0.18, "grad_norm": 2.5871234228385585, "learning_rate": 0.000941547392244575, "loss": 4.0763, "step": 1583 }, { "epoch": 0.18, "grad_norm": 1.3901938708317705, "learning_rate": 0.0009414602345692984, "loss": 3.8871, "step": 1584 }, { "epoch": 0.18, "grad_norm": 1.2759141511285401, "learning_rate": 0.0009413730160029974, "loss": 4.0727, "step": 1585 }, { "epoch": 0.18, "grad_norm": 1.6960500482537495, "learning_rate": 0.0009412857365577023, "loss": 4.2371, "step": 1586 }, { "epoch": 0.18, "grad_norm": 2.687263299408962, "learning_rate": 0.0009411983962454515, "loss": 4.031, "step": 1587 }, { "epoch": 0.18, "grad_norm": 1.619550097512104, "learning_rate": 0.0009411109950782919, "loss": 3.9088, "step": 1588 }, { "epoch": 0.18, "grad_norm": 1.1520579066673606, "learning_rate": 
0.0009410235330682788, "loss": 3.9316, "step": 1589 }, { "epoch": 0.18, "grad_norm": 1.2928703635728427, "learning_rate": 0.0009409360102274761, "loss": 3.9791, "step": 1590 }, { "epoch": 0.18, "grad_norm": 1.4975000849410067, "learning_rate": 0.0009408484265679558, "loss": 3.9566, "step": 1591 }, { "epoch": 0.18, "grad_norm": 1.4290780670591094, "learning_rate": 0.0009407607821017983, "loss": 3.9949, "step": 1592 }, { "epoch": 0.18, "grad_norm": 1.2538358419918487, "learning_rate": 0.0009406730768410927, "loss": 4.0219, "step": 1593 }, { "epoch": 0.18, "grad_norm": 1.4509249131459243, "learning_rate": 0.0009405853107979361, "loss": 3.9853, "step": 1594 }, { "epoch": 0.18, "grad_norm": 2.4371874016745227, "learning_rate": 0.0009404974839844341, "loss": 4.0851, "step": 1595 }, { "epoch": 0.18, "grad_norm": 2.0103357929397667, "learning_rate": 0.0009404095964127008, "loss": 4.091, "step": 1596 }, { "epoch": 0.18, "grad_norm": 1.3862863838808825, "learning_rate": 0.0009403216480948589, "loss": 4.0673, "step": 1597 }, { "epoch": 0.18, "grad_norm": 3.7059047065659514, "learning_rate": 0.0009402336390430388, "loss": 3.9567, "step": 1598 }, { "epoch": 0.18, "grad_norm": 2.513612040559258, "learning_rate": 0.0009401455692693798, "loss": 4.0499, "step": 1599 }, { "epoch": 0.18, "grad_norm": 1.2563832648353617, "learning_rate": 0.0009400574387860294, "loss": 4.2083, "step": 1600 }, { "epoch": 0.18, "grad_norm": 1.0538875703872752, "learning_rate": 0.0009399692476051436, "loss": 3.946, "step": 1601 }, { "epoch": 0.18, "grad_norm": 1.3157696139264001, "learning_rate": 0.0009398809957388868, "loss": 4.0715, "step": 1602 }, { "epoch": 0.18, "grad_norm": 2.9565022421784732, "learning_rate": 0.0009397926831994314, "loss": 4.1437, "step": 1603 }, { "epoch": 0.18, "grad_norm": 1.2550727821854897, "learning_rate": 0.0009397043099989587, "loss": 3.9431, "step": 1604 }, { "epoch": 0.18, "grad_norm": 1.2138049868893723, "learning_rate": 0.0009396158761496577, "loss": 4.0339, "step": 1605 }, { "epoch": 0.18, "grad_norm": 1.8011877941976986, "learning_rate": 0.0009395273816637267, "loss": 4.125, "step": 1606 }, { "epoch": 0.18, "grad_norm": 0.9954549094777444, "learning_rate": 0.0009394388265533713, "loss": 4.1493, "step": 1607 }, { "epoch": 0.18, "grad_norm": 1.2425507722540214, "learning_rate": 0.0009393502108308064, "loss": 4.0795, "step": 1608 }, { "epoch": 0.18, "grad_norm": 1.5196308037672943, "learning_rate": 0.0009392615345082547, "loss": 4.0279, "step": 1609 }, { "epoch": 0.18, "grad_norm": 1.3749328710618927, "learning_rate": 0.0009391727975979474, "loss": 4.1508, "step": 1610 }, { "epoch": 0.18, "grad_norm": 1.4216759260195444, "learning_rate": 0.0009390840001121239, "loss": 3.9823, "step": 1611 }, { "epoch": 0.18, "grad_norm": 1.6269444980410777, "learning_rate": 0.0009389951420630325, "loss": 3.9962, "step": 1612 }, { "epoch": 0.18, "grad_norm": 1.0262761873546005, "learning_rate": 0.0009389062234629292, "loss": 3.9585, "step": 1613 }, { "epoch": 0.19, "grad_norm": 1.2330421214462208, "learning_rate": 0.0009388172443240788, "loss": 3.9964, "step": 1614 }, { "epoch": 0.19, "grad_norm": 2.2364422863017177, "learning_rate": 0.0009387282046587539, "loss": 3.8009, "step": 1615 }, { "epoch": 0.19, "grad_norm": 1.2938754038113647, "learning_rate": 0.0009386391044792363, "loss": 4.0828, "step": 1616 }, { "epoch": 0.19, "grad_norm": 1.714197619867434, "learning_rate": 0.0009385499437978153, "loss": 3.9215, "step": 1617 }, { "epoch": 0.19, "grad_norm": 1.0263523474657201, "learning_rate": 
0.0009384607226267891, "loss": 4.1643, "step": 1618 }, { "epoch": 0.19, "grad_norm": 1.0495324794160328, "learning_rate": 0.0009383714409784643, "loss": 3.7803, "step": 1619 }, { "epoch": 0.19, "grad_norm": 1.3151827680799746, "learning_rate": 0.000938282098865155, "loss": 3.9883, "step": 1620 }, { "epoch": 0.19, "grad_norm": 1.23278454118889, "learning_rate": 0.0009381926962991847, "loss": 4.0857, "step": 1621 }, { "epoch": 0.19, "grad_norm": 1.0767898160242528, "learning_rate": 0.0009381032332928847, "loss": 3.8981, "step": 1622 }, { "epoch": 0.19, "grad_norm": 2.546061243594486, "learning_rate": 0.0009380137098585946, "loss": 4.1441, "step": 1623 }, { "epoch": 0.19, "grad_norm": 1.5108368308240396, "learning_rate": 0.0009379241260086626, "loss": 4.0278, "step": 1624 }, { "epoch": 0.19, "grad_norm": 0.9095612343202243, "learning_rate": 0.0009378344817554449, "loss": 3.9521, "step": 1625 }, { "epoch": 0.19, "grad_norm": 1.0140101542492994, "learning_rate": 0.0009377447771113065, "loss": 4.0056, "step": 1626 }, { "epoch": 0.19, "grad_norm": 1.2126823760900876, "learning_rate": 0.0009376550120886203, "loss": 4.0017, "step": 1627 }, { "epoch": 0.19, "grad_norm": 1.136129557986805, "learning_rate": 0.0009375651866997674, "loss": 3.851, "step": 1628 }, { "epoch": 0.19, "grad_norm": 2.4730578445048015, "learning_rate": 0.0009374753009571379, "loss": 4.0367, "step": 1629 }, { "epoch": 0.19, "grad_norm": 1.0455214703393232, "learning_rate": 0.0009373853548731297, "loss": 3.9871, "step": 1630 }, { "epoch": 0.19, "grad_norm": 2.7352257679578376, "learning_rate": 0.000937295348460149, "loss": 4.2117, "step": 1631 }, { "epoch": 0.19, "grad_norm": 1.6231944137494725, "learning_rate": 0.0009372052817306106, "loss": 3.9737, "step": 1632 }, { "epoch": 0.19, "grad_norm": 1.4506703456139152, "learning_rate": 0.0009371151546969376, "loss": 3.8869, "step": 1633 }, { "epoch": 0.19, "grad_norm": 1.1328952703999986, "learning_rate": 0.0009370249673715611, "loss": 4.0657, "step": 1634 }, { "epoch": 0.19, "grad_norm": 1.2211798173500854, "learning_rate": 0.0009369347197669207, "loss": 4.0564, "step": 1635 }, { "epoch": 0.19, "grad_norm": 1.105520019448545, "learning_rate": 0.0009368444118954646, "loss": 4.0072, "step": 1636 }, { "epoch": 0.19, "grad_norm": 1.103721639449375, "learning_rate": 0.0009367540437696489, "loss": 4.036, "step": 1637 }, { "epoch": 0.19, "grad_norm": 0.9829757397978973, "learning_rate": 0.0009366636154019381, "loss": 4.1445, "step": 1638 }, { "epoch": 0.19, "grad_norm": 1.126405591642974, "learning_rate": 0.0009365731268048052, "loss": 4.1211, "step": 1639 }, { "epoch": 0.19, "grad_norm": 1.796136183633084, "learning_rate": 0.0009364825779907311, "loss": 4.0849, "step": 1640 }, { "epoch": 0.19, "grad_norm": 0.8616409234491863, "learning_rate": 0.0009363919689722056, "loss": 3.8996, "step": 1641 }, { "epoch": 0.19, "grad_norm": 1.0690148869023086, "learning_rate": 0.0009363012997617264, "loss": 3.9092, "step": 1642 }, { "epoch": 0.19, "grad_norm": 1.0930841452268452, "learning_rate": 0.0009362105703717994, "loss": 4.0064, "step": 1643 }, { "epoch": 0.19, "grad_norm": 0.9749590076962155, "learning_rate": 0.0009361197808149393, "loss": 4.0487, "step": 1644 }, { "epoch": 0.19, "grad_norm": 1.6268266854158784, "learning_rate": 0.0009360289311036688, "loss": 3.9739, "step": 1645 }, { "epoch": 0.19, "grad_norm": 1.1612526474279157, "learning_rate": 0.0009359380212505184, "loss": 3.9588, "step": 1646 }, { "epoch": 0.19, "grad_norm": 1.243950413133019, "learning_rate": 0.0009358470512680278, 
"loss": 3.935, "step": 1647 }, { "epoch": 0.19, "grad_norm": 0.9213094881302677, "learning_rate": 0.0009357560211687445, "loss": 4.0706, "step": 1648 }, { "epoch": 0.19, "grad_norm": 1.6467514034858286, "learning_rate": 0.0009356649309652243, "loss": 4.1422, "step": 1649 }, { "epoch": 0.19, "grad_norm": 1.909505202059231, "learning_rate": 0.0009355737806700315, "loss": 3.9411, "step": 1650 }, { "epoch": 0.19, "grad_norm": 1.0671083977834006, "learning_rate": 0.0009354825702957383, "loss": 4.0571, "step": 1651 }, { "epoch": 0.19, "grad_norm": 1.0985723661883144, "learning_rate": 0.0009353912998549259, "loss": 3.882, "step": 1652 }, { "epoch": 0.19, "grad_norm": 4.162487433797309, "learning_rate": 0.0009352999693601827, "loss": 3.8339, "step": 1653 }, { "epoch": 0.19, "grad_norm": 1.0873001189646185, "learning_rate": 0.0009352085788241064, "loss": 4.0041, "step": 1654 }, { "epoch": 0.19, "grad_norm": 1.4421927954874678, "learning_rate": 0.0009351171282593026, "loss": 3.9875, "step": 1655 }, { "epoch": 0.19, "grad_norm": 1.1813924407814649, "learning_rate": 0.0009350256176783847, "loss": 3.7217, "step": 1656 }, { "epoch": 0.19, "grad_norm": 1.3464189196839678, "learning_rate": 0.0009349340470939753, "loss": 4.0857, "step": 1657 }, { "epoch": 0.19, "grad_norm": 2.082642538327698, "learning_rate": 0.0009348424165187049, "loss": 4.0625, "step": 1658 }, { "epoch": 0.19, "grad_norm": 1.3487157868958801, "learning_rate": 0.0009347507259652119, "loss": 4.1028, "step": 1659 }, { "epoch": 0.19, "grad_norm": 1.7121205024701807, "learning_rate": 0.0009346589754461433, "loss": 4.1058, "step": 1660 }, { "epoch": 0.19, "grad_norm": 1.2906840129301675, "learning_rate": 0.0009345671649741545, "loss": 4.0283, "step": 1661 }, { "epoch": 0.19, "grad_norm": 0.9990118829920177, "learning_rate": 0.0009344752945619089, "loss": 3.6723, "step": 1662 }, { "epoch": 0.19, "grad_norm": 1.3008971277686698, "learning_rate": 0.0009343833642220781, "loss": 4.0056, "step": 1663 }, { "epoch": 0.19, "grad_norm": 1.3721858263754008, "learning_rate": 0.0009342913739673424, "loss": 4.1186, "step": 1664 }, { "epoch": 0.19, "grad_norm": 2.0812505425602588, "learning_rate": 0.00093419932381039, "loss": 4.1793, "step": 1665 }, { "epoch": 0.19, "grad_norm": 1.1388367603316079, "learning_rate": 0.0009341072137639175, "loss": 3.9018, "step": 1666 }, { "epoch": 0.19, "grad_norm": 1.0690072441349499, "learning_rate": 0.0009340150438406296, "loss": 3.9003, "step": 1667 }, { "epoch": 0.19, "grad_norm": 4.735331686802502, "learning_rate": 0.0009339228140532396, "loss": 3.8626, "step": 1668 }, { "epoch": 0.19, "grad_norm": 4.318381624926373, "learning_rate": 0.0009338305244144687, "loss": 4.119, "step": 1669 }, { "epoch": 0.19, "grad_norm": 1.114446825991276, "learning_rate": 0.0009337381749370463, "loss": 4.2859, "step": 1670 }, { "epoch": 0.19, "grad_norm": 1.251597541938194, "learning_rate": 0.0009336457656337108, "loss": 4.1579, "step": 1671 }, { "epoch": 0.19, "grad_norm": 20.5683775033926, "learning_rate": 0.0009335532965172079, "loss": 4.172, "step": 1672 }, { "epoch": 0.19, "grad_norm": 1.661225405973415, "learning_rate": 0.0009334607676002919, "loss": 4.0344, "step": 1673 }, { "epoch": 0.19, "grad_norm": 1.0096937056465358, "learning_rate": 0.0009333681788957256, "loss": 3.7071, "step": 1674 }, { "epoch": 0.19, "grad_norm": 0.973734933008646, "learning_rate": 0.0009332755304162798, "loss": 3.9592, "step": 1675 }, { "epoch": 0.19, "grad_norm": 1.197317498335497, "learning_rate": 0.0009331828221747335, "loss": 3.9507, "step": 1676 }, { 
"epoch": 0.19, "grad_norm": 1.23811015892529, "learning_rate": 0.0009330900541838741, "loss": 4.1988, "step": 1677 }, { "epoch": 0.19, "grad_norm": 1.7788590119693857, "learning_rate": 0.0009329972264564972, "loss": 4.2852, "step": 1678 }, { "epoch": 0.19, "grad_norm": 1.1077777288700072, "learning_rate": 0.0009329043390054066, "loss": 4.0403, "step": 1679 }, { "epoch": 0.19, "grad_norm": 7.539112339196247, "learning_rate": 0.0009328113918434142, "loss": 4.1673, "step": 1680 }, { "epoch": 0.19, "grad_norm": 1.1670767075004143, "learning_rate": 0.0009327183849833406, "loss": 4.1587, "step": 1681 }, { "epoch": 0.19, "grad_norm": 1.9684272406988528, "learning_rate": 0.0009326253184380141, "loss": 3.8978, "step": 1682 }, { "epoch": 0.19, "grad_norm": 1.233687693170137, "learning_rate": 0.0009325321922202716, "loss": 4.165, "step": 1683 }, { "epoch": 0.19, "grad_norm": 1.15489221728649, "learning_rate": 0.0009324390063429578, "loss": 4.0118, "step": 1684 }, { "epoch": 0.19, "grad_norm": 2.3816232511551396, "learning_rate": 0.0009323457608189263, "loss": 4.0489, "step": 1685 }, { "epoch": 0.19, "grad_norm": 1.1777243240380733, "learning_rate": 0.0009322524556610384, "loss": 4.0713, "step": 1686 }, { "epoch": 0.19, "grad_norm": 1.4004760987931757, "learning_rate": 0.0009321590908821635, "loss": 3.9659, "step": 1687 }, { "epoch": 0.19, "grad_norm": 1.011844150989293, "learning_rate": 0.0009320656664951797, "loss": 3.9337, "step": 1688 }, { "epoch": 0.19, "grad_norm": 1.8199772574475173, "learning_rate": 0.0009319721825129734, "loss": 4.0797, "step": 1689 }, { "epoch": 0.19, "grad_norm": 1.1822345217533188, "learning_rate": 0.0009318786389484383, "loss": 4.0477, "step": 1690 }, { "epoch": 0.19, "grad_norm": 2.911830467058182, "learning_rate": 0.0009317850358144778, "loss": 3.9764, "step": 1691 }, { "epoch": 0.19, "grad_norm": 1.7100825846759797, "learning_rate": 0.0009316913731240018, "loss": 4.042, "step": 1692 }, { "epoch": 0.19, "grad_norm": 1.081233324923232, "learning_rate": 0.0009315976508899298, "loss": 4.1465, "step": 1693 }, { "epoch": 0.19, "grad_norm": 1.082848258426668, "learning_rate": 0.0009315038691251887, "loss": 4.3746, "step": 1694 }, { "epoch": 0.19, "grad_norm": 1.7200375115687558, "learning_rate": 0.0009314100278427143, "loss": 4.1446, "step": 1695 }, { "epoch": 0.19, "grad_norm": 2.939169147179145, "learning_rate": 0.0009313161270554498, "loss": 3.9169, "step": 1696 }, { "epoch": 0.19, "grad_norm": 1.2929355714792177, "learning_rate": 0.0009312221667763472, "loss": 3.9473, "step": 1697 }, { "epoch": 0.19, "grad_norm": 1.5248080277712768, "learning_rate": 0.0009311281470183667, "loss": 4.147, "step": 1698 }, { "epoch": 0.19, "grad_norm": 1.2141275701849568, "learning_rate": 0.0009310340677944762, "loss": 3.7619, "step": 1699 }, { "epoch": 0.19, "grad_norm": 1.2585995143520283, "learning_rate": 0.0009309399291176524, "loss": 3.8643, "step": 1700 }, { "epoch": 0.2, "grad_norm": 1.4798896161957205, "learning_rate": 0.0009308457310008798, "loss": 3.8553, "step": 1701 }, { "epoch": 0.2, "grad_norm": 1.1349306212286223, "learning_rate": 0.0009307514734571514, "loss": 3.8044, "step": 1702 }, { "epoch": 0.2, "grad_norm": 1.2084224558686099, "learning_rate": 0.0009306571564994679, "loss": 3.9526, "step": 1703 }, { "epoch": 0.2, "grad_norm": 1.162433250231776, "learning_rate": 0.000930562780140839, "loss": 4.203, "step": 1704 }, { "epoch": 0.2, "grad_norm": 1.1790333484493185, "learning_rate": 0.0009304683443942816, "loss": 3.9238, "step": 1705 }, { "epoch": 0.2, "grad_norm": 
1.600829858706901, "learning_rate": 0.0009303738492728216, "loss": 4.3385, "step": 1706 }, { "epoch": 0.2, "grad_norm": 1.5396994703215223, "learning_rate": 0.000930279294789493, "loss": 4.0737, "step": 1707 }, { "epoch": 0.2, "grad_norm": 1.7240854610986631, "learning_rate": 0.0009301846809573373, "loss": 4.0215, "step": 1708 }, { "epoch": 0.2, "grad_norm": 1.3393898592556315, "learning_rate": 0.000930090007789405, "loss": 4.1072, "step": 1709 }, { "epoch": 0.2, "grad_norm": 1.0347141783569862, "learning_rate": 0.0009299952752987544, "loss": 4.1534, "step": 1710 }, { "epoch": 0.2, "grad_norm": 0.966407475124348, "learning_rate": 0.0009299004834984519, "loss": 4.1058, "step": 1711 }, { "epoch": 0.2, "grad_norm": 1.1744253197390826, "learning_rate": 0.0009298056324015724, "loss": 4.1533, "step": 1712 }, { "epoch": 0.2, "grad_norm": 1.0370486438055186, "learning_rate": 0.0009297107220211988, "loss": 3.8903, "step": 1713 }, { "epoch": 0.2, "grad_norm": 0.9981109237263459, "learning_rate": 0.0009296157523704223, "loss": 4.0912, "step": 1714 }, { "epoch": 0.2, "grad_norm": 1.044888667139545, "learning_rate": 0.0009295207234623418, "loss": 3.9143, "step": 1715 }, { "epoch": 0.2, "grad_norm": 2.7486125588550157, "learning_rate": 0.000929425635310065, "loss": 3.9444, "step": 1716 }, { "epoch": 0.2, "grad_norm": 1.1831191331560453, "learning_rate": 0.0009293304879267073, "loss": 4.1932, "step": 1717 }, { "epoch": 0.2, "grad_norm": 1.6004801696395872, "learning_rate": 0.0009292352813253926, "loss": 3.902, "step": 1718 }, { "epoch": 0.2, "grad_norm": 0.9430609008410357, "learning_rate": 0.0009291400155192528, "loss": 4.0424, "step": 1719 }, { "epoch": 0.2, "grad_norm": 1.069520550633942, "learning_rate": 0.0009290446905214281, "loss": 3.9443, "step": 1720 }, { "epoch": 0.2, "grad_norm": 1.037860665652639, "learning_rate": 0.0009289493063450666, "loss": 3.9188, "step": 1721 }, { "epoch": 0.2, "grad_norm": 2.032594538590189, "learning_rate": 0.0009288538630033247, "loss": 3.9384, "step": 1722 }, { "epoch": 0.2, "grad_norm": 1.228064835462027, "learning_rate": 0.0009287583605093674, "loss": 3.9134, "step": 1723 }, { "epoch": 0.2, "grad_norm": 0.9941241207060189, "learning_rate": 0.000928662798876367, "loss": 3.9517, "step": 1724 }, { "epoch": 0.2, "grad_norm": 1.7401155438716953, "learning_rate": 0.0009285671781175045, "loss": 3.913, "step": 1725 }, { "epoch": 0.2, "grad_norm": 1.2130439465626965, "learning_rate": 0.000928471498245969, "loss": 4.0622, "step": 1726 }, { "epoch": 0.2, "grad_norm": 1.166844698734072, "learning_rate": 0.0009283757592749577, "loss": 3.7524, "step": 1727 }, { "epoch": 0.2, "grad_norm": 1.361902052027812, "learning_rate": 0.0009282799612176762, "loss": 3.9207, "step": 1728 }, { "epoch": 0.2, "grad_norm": 0.9667177549694459, "learning_rate": 0.0009281841040873376, "loss": 4.0281, "step": 1729 }, { "epoch": 0.2, "grad_norm": 1.4353531647951272, "learning_rate": 0.0009280881878971637, "loss": 4.1683, "step": 1730 }, { "epoch": 0.2, "grad_norm": 1.060197211552497, "learning_rate": 0.0009279922126603846, "loss": 3.8177, "step": 1731 }, { "epoch": 0.2, "grad_norm": 1.0759155594269398, "learning_rate": 0.000927896178390238, "loss": 3.9615, "step": 1732 }, { "epoch": 0.2, "grad_norm": 1.606904808174467, "learning_rate": 0.0009278000850999699, "loss": 3.9761, "step": 1733 }, { "epoch": 0.2, "grad_norm": 0.955768775797239, "learning_rate": 0.0009277039328028347, "loss": 3.9047, "step": 1734 }, { "epoch": 0.2, "grad_norm": 1.1306837987390392, "learning_rate": 0.0009276077215120949, 
"loss": 4.0202, "step": 1735 }, { "epoch": 0.2, "grad_norm": 1.336100974571278, "learning_rate": 0.0009275114512410208, "loss": 4.1424, "step": 1736 }, { "epoch": 0.2, "grad_norm": 1.020583862585202, "learning_rate": 0.000927415122002891, "loss": 3.9656, "step": 1737 }, { "epoch": 0.2, "grad_norm": 1.1406699345798785, "learning_rate": 0.0009273187338109925, "loss": 3.8945, "step": 1738 }, { "epoch": 0.2, "grad_norm": 1.244124982888195, "learning_rate": 0.0009272222866786201, "loss": 3.9587, "step": 1739 }, { "epoch": 0.2, "grad_norm": 1.423240571860448, "learning_rate": 0.0009271257806190769, "loss": 3.8154, "step": 1740 }, { "epoch": 0.2, "grad_norm": 1.1915878943078342, "learning_rate": 0.0009270292156456738, "loss": 3.7836, "step": 1741 }, { "epoch": 0.2, "grad_norm": 0.9369691502641563, "learning_rate": 0.0009269325917717306, "loss": 4.0439, "step": 1742 }, { "epoch": 0.2, "grad_norm": 1.1073708380209586, "learning_rate": 0.0009268359090105743, "loss": 4.0351, "step": 1743 }, { "epoch": 0.2, "grad_norm": 1.3119073850439409, "learning_rate": 0.0009267391673755405, "loss": 3.9256, "step": 1744 }, { "epoch": 0.2, "grad_norm": 3.695716914138177, "learning_rate": 0.0009266423668799731, "loss": 3.9907, "step": 1745 }, { "epoch": 0.2, "grad_norm": 3.6105641769377645, "learning_rate": 0.0009265455075372237, "loss": 3.9463, "step": 1746 }, { "epoch": 0.2, "grad_norm": 1.1779617910160587, "learning_rate": 0.0009264485893606523, "loss": 4.1558, "step": 1747 }, { "epoch": 0.2, "grad_norm": 0.900233359231401, "learning_rate": 0.0009263516123636267, "loss": 3.9523, "step": 1748 }, { "epoch": 0.2, "grad_norm": 1.1858801707262305, "learning_rate": 0.0009262545765595232, "loss": 4.2544, "step": 1749 }, { "epoch": 0.2, "grad_norm": 1.3073465400433013, "learning_rate": 0.000926157481961726, "loss": 3.8637, "step": 1750 }, { "epoch": 0.2, "grad_norm": 1.1348274792477937, "learning_rate": 0.0009260603285836276, "loss": 3.8499, "step": 1751 }, { "epoch": 0.2, "grad_norm": 1.2319269303256337, "learning_rate": 0.0009259631164386282, "loss": 3.9436, "step": 1752 }, { "epoch": 0.2, "grad_norm": 2.2573583051123043, "learning_rate": 0.0009258658455401365, "loss": 4.1409, "step": 1753 }, { "epoch": 0.2, "grad_norm": 1.072893039228121, "learning_rate": 0.0009257685159015692, "loss": 3.9653, "step": 1754 }, { "epoch": 0.2, "grad_norm": 0.9846242849565428, "learning_rate": 0.0009256711275363509, "loss": 4.0616, "step": 1755 }, { "epoch": 0.2, "grad_norm": 1.860063933119687, "learning_rate": 0.0009255736804579147, "loss": 4.2312, "step": 1756 }, { "epoch": 0.2, "grad_norm": 1.0092375090171601, "learning_rate": 0.0009254761746797013, "loss": 3.9494, "step": 1757 }, { "epoch": 0.2, "grad_norm": 1.651459093455684, "learning_rate": 0.0009253786102151602, "loss": 4.0293, "step": 1758 }, { "epoch": 0.2, "grad_norm": 0.9858009657084182, "learning_rate": 0.0009252809870777481, "loss": 3.9778, "step": 1759 }, { "epoch": 0.2, "grad_norm": 0.9373144197142566, "learning_rate": 0.0009251833052809304, "loss": 3.8809, "step": 1760 }, { "epoch": 0.2, "grad_norm": 1.3812779377229834, "learning_rate": 0.0009250855648381805, "loss": 4.0979, "step": 1761 }, { "epoch": 0.2, "grad_norm": 0.9421798120037523, "learning_rate": 0.0009249877657629799, "loss": 3.9424, "step": 1762 }, { "epoch": 0.2, "grad_norm": 1.4497164620293566, "learning_rate": 0.0009248899080688178, "loss": 4.0048, "step": 1763 }, { "epoch": 0.2, "grad_norm": 2.6371577426569495, "learning_rate": 0.0009247919917691923, "loss": 3.9917, "step": 1764 }, { "epoch": 0.2, 
"grad_norm": 1.1572401082048003, "learning_rate": 0.0009246940168776086, "loss": 4.0143, "step": 1765 }, { "epoch": 0.2, "grad_norm": 0.8916848840368414, "learning_rate": 0.0009245959834075807, "loss": 4.015, "step": 1766 }, { "epoch": 0.2, "grad_norm": 1.4366222872268832, "learning_rate": 0.0009244978913726304, "loss": 3.9815, "step": 1767 }, { "epoch": 0.2, "grad_norm": 1.1568837993598007, "learning_rate": 0.0009243997407862878, "loss": 3.8459, "step": 1768 }, { "epoch": 0.2, "grad_norm": 0.9237954463567245, "learning_rate": 0.0009243015316620906, "loss": 4.0727, "step": 1769 }, { "epoch": 0.2, "grad_norm": 0.91370198264041, "learning_rate": 0.0009242032640135852, "loss": 3.961, "step": 1770 }, { "epoch": 0.2, "grad_norm": 1.0727108064378128, "learning_rate": 0.0009241049378543254, "loss": 4.1481, "step": 1771 }, { "epoch": 0.2, "grad_norm": 1.7711588953671622, "learning_rate": 0.0009240065531978736, "loss": 4.0044, "step": 1772 }, { "epoch": 0.2, "grad_norm": 0.9907775872630938, "learning_rate": 0.0009239081100578002, "loss": 4.2258, "step": 1773 }, { "epoch": 0.2, "grad_norm": 1.1396946674079174, "learning_rate": 0.0009238096084476832, "loss": 4.03, "step": 1774 }, { "epoch": 0.2, "grad_norm": 4.667288758003447, "learning_rate": 0.0009237110483811096, "loss": 4.0662, "step": 1775 }, { "epoch": 0.2, "grad_norm": 0.898073742317284, "learning_rate": 0.0009236124298716734, "loss": 4.0275, "step": 1776 }, { "epoch": 0.2, "grad_norm": 0.8832574224234054, "learning_rate": 0.0009235137529329772, "loss": 4.1248, "step": 1777 }, { "epoch": 0.2, "grad_norm": 1.259787090233513, "learning_rate": 0.0009234150175786318, "loss": 4.105, "step": 1778 }, { "epoch": 0.2, "grad_norm": 1.2478977285775896, "learning_rate": 0.0009233162238222556, "loss": 3.9974, "step": 1779 }, { "epoch": 0.2, "grad_norm": 1.7237341902025927, "learning_rate": 0.0009232173716774757, "loss": 3.8888, "step": 1780 }, { "epoch": 0.2, "grad_norm": 1.1781336554439632, "learning_rate": 0.0009231184611579265, "loss": 4.1694, "step": 1781 }, { "epoch": 0.2, "grad_norm": 2.310748710924791, "learning_rate": 0.000923019492277251, "loss": 4.0395, "step": 1782 }, { "epoch": 0.2, "grad_norm": 0.9047961151359055, "learning_rate": 0.0009229204650491001, "loss": 3.8939, "step": 1783 }, { "epoch": 0.2, "grad_norm": 0.9373725646782912, "learning_rate": 0.0009228213794871325, "loss": 4.033, "step": 1784 }, { "epoch": 0.2, "grad_norm": 0.9120922700268838, "learning_rate": 0.0009227222356050154, "loss": 3.992, "step": 1785 }, { "epoch": 0.2, "grad_norm": 0.9171269602542529, "learning_rate": 0.0009226230334164236, "loss": 3.9542, "step": 1786 }, { "epoch": 0.2, "grad_norm": 1.0158639402888126, "learning_rate": 0.0009225237729350403, "loss": 3.9805, "step": 1787 }, { "epoch": 0.21, "grad_norm": 1.1143160686781721, "learning_rate": 0.0009224244541745566, "loss": 4.0101, "step": 1788 }, { "epoch": 0.21, "grad_norm": 4.764763379403035, "learning_rate": 0.0009223250771486717, "loss": 4.227, "step": 1789 }, { "epoch": 0.21, "grad_norm": 0.9563256626518415, "learning_rate": 0.0009222256418710923, "loss": 3.9603, "step": 1790 }, { "epoch": 0.21, "grad_norm": 1.03432908719441, "learning_rate": 0.0009221261483555343, "loss": 3.8266, "step": 1791 }, { "epoch": 0.21, "grad_norm": 0.8726337630761427, "learning_rate": 0.0009220265966157205, "loss": 4.052, "step": 1792 }, { "epoch": 0.21, "grad_norm": 1.3782326020560451, "learning_rate": 0.0009219269866653823, "loss": 4.21, "step": 1793 }, { "epoch": 0.21, "grad_norm": 0.9606290492496129, "learning_rate": 
0.0009218273185182588, "loss": 3.8903, "step": 1794 }, { "epoch": 0.21, "grad_norm": 1.1528528082833698, "learning_rate": 0.0009217275921880976, "loss": 4.1822, "step": 1795 }, { "epoch": 0.21, "grad_norm": 0.9376737041732423, "learning_rate": 0.000921627807688654, "loss": 4.1833, "step": 1796 }, { "epoch": 0.21, "grad_norm": 1.5017965948433791, "learning_rate": 0.0009215279650336911, "loss": 4.1035, "step": 1797 }, { "epoch": 0.21, "grad_norm": 1.072683472755091, "learning_rate": 0.0009214280642369806, "loss": 4.0124, "step": 1798 }, { "epoch": 0.21, "grad_norm": 1.1545569320975, "learning_rate": 0.0009213281053123018, "loss": 3.8911, "step": 1799 }, { "epoch": 0.21, "grad_norm": 1.3750429742291181, "learning_rate": 0.000921228088273442, "loss": 3.9633, "step": 1800 }, { "epoch": 0.21, "grad_norm": 1.3441856899396978, "learning_rate": 0.0009211280131341968, "loss": 4.0043, "step": 1801 }, { "epoch": 0.21, "grad_norm": 1.1896882866859428, "learning_rate": 0.0009210278799083695, "loss": 4.149, "step": 1802 }, { "epoch": 0.21, "grad_norm": 2.255949328393041, "learning_rate": 0.000920927688609772, "loss": 4.1405, "step": 1803 }, { "epoch": 0.21, "grad_norm": 1.2248613388249363, "learning_rate": 0.0009208274392522231, "loss": 3.832, "step": 1804 }, { "epoch": 0.21, "grad_norm": 1.1499103190626434, "learning_rate": 0.0009207271318495509, "loss": 3.9207, "step": 1805 }, { "epoch": 0.21, "grad_norm": 1.589513354446093, "learning_rate": 0.0009206267664155906, "loss": 3.9803, "step": 1806 }, { "epoch": 0.21, "grad_norm": 1.637492594270389, "learning_rate": 0.0009205263429641857, "loss": 4.0536, "step": 1807 }, { "epoch": 0.21, "grad_norm": 1.0027186279161895, "learning_rate": 0.0009204258615091879, "loss": 4.2106, "step": 1808 }, { "epoch": 0.21, "grad_norm": 0.9694951957035102, "learning_rate": 0.0009203253220644564, "loss": 4.148, "step": 1809 }, { "epoch": 0.21, "grad_norm": 1.1945540549527531, "learning_rate": 0.0009202247246438589, "loss": 3.9152, "step": 1810 }, { "epoch": 0.21, "grad_norm": 1.196932782805297, "learning_rate": 0.0009201240692612708, "loss": 4.1247, "step": 1811 }, { "epoch": 0.21, "grad_norm": 0.8465695768359607, "learning_rate": 0.0009200233559305758, "loss": 3.7639, "step": 1812 }, { "epoch": 0.21, "grad_norm": 1.0915310721046587, "learning_rate": 0.0009199225846656649, "loss": 4.0861, "step": 1813 }, { "epoch": 0.21, "grad_norm": 0.9003379540591894, "learning_rate": 0.0009198217554804382, "loss": 3.9261, "step": 1814 }, { "epoch": 0.21, "grad_norm": 1.9391380020820164, "learning_rate": 0.0009197208683888028, "loss": 3.9119, "step": 1815 }, { "epoch": 0.21, "grad_norm": 0.8604113169531732, "learning_rate": 0.0009196199234046741, "loss": 4.0339, "step": 1816 }, { "epoch": 0.21, "grad_norm": 1.503496250076568, "learning_rate": 0.0009195189205419757, "loss": 3.8113, "step": 1817 }, { "epoch": 0.21, "grad_norm": 1.0221305110920769, "learning_rate": 0.000919417859814639, "loss": 4.2474, "step": 1818 }, { "epoch": 0.21, "grad_norm": 1.0259652596663043, "learning_rate": 0.0009193167412366034, "loss": 4.1427, "step": 1819 }, { "epoch": 0.21, "grad_norm": 0.9828148289005327, "learning_rate": 0.0009192155648218162, "loss": 3.8969, "step": 1820 }, { "epoch": 0.21, "grad_norm": 0.979603130022155, "learning_rate": 0.0009191143305842329, "loss": 3.8377, "step": 1821 }, { "epoch": 0.21, "grad_norm": 1.8167280835128574, "learning_rate": 0.0009190130385378166, "loss": 3.9002, "step": 1822 }, { "epoch": 0.21, "grad_norm": 1.378418608671584, "learning_rate": 0.0009189116886965388, "loss": 
3.8517, "step": 1823 }, { "epoch": 0.21, "grad_norm": 1.6648088065836824, "learning_rate": 0.0009188102810743788, "loss": 3.9965, "step": 1824 }, { "epoch": 0.21, "grad_norm": 1.0996312161984336, "learning_rate": 0.0009187088156853236, "loss": 3.8353, "step": 1825 }, { "epoch": 0.21, "grad_norm": 3.9731417420315633, "learning_rate": 0.0009186072925433689, "loss": 4.0632, "step": 1826 }, { "epoch": 0.21, "grad_norm": 1.3271402142396824, "learning_rate": 0.0009185057116625172, "loss": 3.9378, "step": 1827 }, { "epoch": 0.21, "grad_norm": 1.0826270883483753, "learning_rate": 0.0009184040730567803, "loss": 4.0848, "step": 1828 }, { "epoch": 0.21, "grad_norm": 1.0187491967496691, "learning_rate": 0.0009183023767401769, "loss": 3.8322, "step": 1829 }, { "epoch": 0.21, "grad_norm": 1.1821455402759855, "learning_rate": 0.0009182006227267343, "loss": 3.9061, "step": 1830 }, { "epoch": 0.21, "grad_norm": 1.937375350734978, "learning_rate": 0.0009180988110304873, "loss": 4.0991, "step": 1831 }, { "epoch": 0.21, "grad_norm": 1.4223198910164485, "learning_rate": 0.000917996941665479, "loss": 3.9215, "step": 1832 }, { "epoch": 0.21, "grad_norm": 1.0313632966596382, "learning_rate": 0.0009178950146457606, "loss": 4.074, "step": 1833 }, { "epoch": 0.21, "grad_norm": 1.6350631751387976, "learning_rate": 0.0009177930299853903, "loss": 3.9155, "step": 1834 }, { "epoch": 0.21, "grad_norm": 1.5099274703234156, "learning_rate": 0.0009176909876984356, "loss": 3.939, "step": 1835 }, { "epoch": 0.21, "grad_norm": 1.0083701958199836, "learning_rate": 0.0009175888877989712, "loss": 4.1037, "step": 1836 }, { "epoch": 0.21, "grad_norm": 1.1605607970725262, "learning_rate": 0.0009174867303010795, "loss": 4.0217, "step": 1837 }, { "epoch": 0.21, "grad_norm": 1.0673507508452864, "learning_rate": 0.0009173845152188516, "loss": 3.8118, "step": 1838 }, { "epoch": 0.21, "grad_norm": 0.9317664654215773, "learning_rate": 0.0009172822425663855, "loss": 4.0482, "step": 1839 }, { "epoch": 0.21, "grad_norm": 1.2917039208685146, "learning_rate": 0.0009171799123577886, "loss": 4.0362, "step": 1840 }, { "epoch": 0.21, "grad_norm": 4.029741476474188, "learning_rate": 0.0009170775246071747, "loss": 3.8445, "step": 1841 }, { "epoch": 0.21, "grad_norm": 2.036403907256877, "learning_rate": 0.0009169750793286667, "loss": 4.1888, "step": 1842 }, { "epoch": 0.21, "grad_norm": 0.973673205932964, "learning_rate": 0.0009168725765363946, "loss": 3.9088, "step": 1843 }, { "epoch": 0.21, "grad_norm": 2.1094145571643903, "learning_rate": 0.0009167700162444969, "loss": 4.2807, "step": 1844 }, { "epoch": 0.21, "grad_norm": 0.9326693239363606, "learning_rate": 0.0009166673984671198, "loss": 4.1753, "step": 1845 }, { "epoch": 0.21, "grad_norm": 0.9255526416603305, "learning_rate": 0.0009165647232184176, "loss": 3.7977, "step": 1846 }, { "epoch": 0.21, "grad_norm": 0.9805804014066567, "learning_rate": 0.0009164619905125522, "loss": 4.2448, "step": 1847 }, { "epoch": 0.21, "grad_norm": 1.2322562833929247, "learning_rate": 0.0009163592003636936, "loss": 4.228, "step": 1848 }, { "epoch": 0.21, "grad_norm": 1.1955767999438562, "learning_rate": 0.00091625635278602, "loss": 3.8148, "step": 1849 }, { "epoch": 0.21, "grad_norm": 1.4843731080773586, "learning_rate": 0.000916153447793717, "loss": 4.1448, "step": 1850 }, { "epoch": 0.21, "grad_norm": 0.9795152150145258, "learning_rate": 0.0009160504854009786, "loss": 3.9192, "step": 1851 }, { "epoch": 0.21, "grad_norm": 0.8457451143184798, "learning_rate": 0.0009159474656220063, "loss": 3.7049, "step": 1852 }, { 
"epoch": 0.21, "grad_norm": 1.6155251727522215, "learning_rate": 0.0009158443884710097, "loss": 3.9688, "step": 1853 }, { "epoch": 0.21, "grad_norm": 11.623277522851895, "learning_rate": 0.0009157412539622065, "loss": 4.0849, "step": 1854 }, { "epoch": 0.21, "grad_norm": 0.9667171283891963, "learning_rate": 0.0009156380621098221, "loss": 3.9106, "step": 1855 }, { "epoch": 0.21, "grad_norm": 1.3684382257225185, "learning_rate": 0.0009155348129280898, "loss": 4.0801, "step": 1856 }, { "epoch": 0.21, "grad_norm": 2.009070664240203, "learning_rate": 0.000915431506431251, "loss": 4.0532, "step": 1857 }, { "epoch": 0.21, "grad_norm": 1.079223649855216, "learning_rate": 0.0009153281426335547, "loss": 3.9181, "step": 1858 }, { "epoch": 0.21, "grad_norm": 1.732516789826969, "learning_rate": 0.0009152247215492577, "loss": 3.7961, "step": 1859 }, { "epoch": 0.21, "grad_norm": 1.4039541368494861, "learning_rate": 0.0009151212431926256, "loss": 3.8897, "step": 1860 }, { "epoch": 0.21, "grad_norm": 1.3368122929777513, "learning_rate": 0.0009150177075779308, "loss": 4.0189, "step": 1861 }, { "epoch": 0.21, "grad_norm": 1.1692397995992874, "learning_rate": 0.0009149141147194542, "loss": 3.9753, "step": 1862 }, { "epoch": 0.21, "grad_norm": 2.670973699877063, "learning_rate": 0.0009148104646314844, "loss": 4.1865, "step": 1863 }, { "epoch": 0.21, "grad_norm": 1.6326147393318697, "learning_rate": 0.000914706757328318, "loss": 4.1095, "step": 1864 }, { "epoch": 0.21, "grad_norm": 0.952264053754519, "learning_rate": 0.0009146029928242596, "loss": 3.8256, "step": 1865 }, { "epoch": 0.21, "grad_norm": 0.9449151785891863, "learning_rate": 0.0009144991711336214, "loss": 4.1363, "step": 1866 }, { "epoch": 0.21, "grad_norm": 1.423104381260123, "learning_rate": 0.0009143952922707235, "loss": 4.1404, "step": 1867 }, { "epoch": 0.21, "grad_norm": 2.2701459374699455, "learning_rate": 0.0009142913562498942, "loss": 4.1069, "step": 1868 }, { "epoch": 0.21, "grad_norm": 1.0422886127603468, "learning_rate": 0.0009141873630854694, "loss": 3.8124, "step": 1869 }, { "epoch": 0.21, "grad_norm": 1.39427314365708, "learning_rate": 0.0009140833127917929, "loss": 4.0035, "step": 1870 }, { "epoch": 0.21, "grad_norm": 1.0923992636345188, "learning_rate": 0.0009139792053832166, "loss": 3.8297, "step": 1871 }, { "epoch": 0.21, "grad_norm": 1.861664194461217, "learning_rate": 0.0009138750408741001, "loss": 3.8637, "step": 1872 }, { "epoch": 0.21, "grad_norm": 1.6573863549866426, "learning_rate": 0.000913770819278811, "loss": 4.1524, "step": 1873 }, { "epoch": 0.21, "grad_norm": 0.9031991428295303, "learning_rate": 0.0009136665406117244, "loss": 4.0622, "step": 1874 }, { "epoch": 0.21, "grad_norm": 1.203462785296127, "learning_rate": 0.0009135622048872238, "loss": 4.2753, "step": 1875 }, { "epoch": 0.22, "grad_norm": 1.372827442043114, "learning_rate": 0.0009134578121197002, "loss": 4.1079, "step": 1876 }, { "epoch": 0.22, "grad_norm": 1.9325130247085072, "learning_rate": 0.0009133533623235526, "loss": 4.0299, "step": 1877 }, { "epoch": 0.22, "grad_norm": 1.070831283744972, "learning_rate": 0.000913248855513188, "loss": 3.924, "step": 1878 }, { "epoch": 0.22, "grad_norm": 0.8760788604249685, "learning_rate": 0.0009131442917030211, "loss": 3.9644, "step": 1879 }, { "epoch": 0.22, "grad_norm": 1.5451558346252263, "learning_rate": 0.0009130396709074741, "loss": 3.9141, "step": 1880 }, { "epoch": 0.22, "grad_norm": 1.1946997652963685, "learning_rate": 0.0009129349931409781, "loss": 3.9695, "step": 1881 }, { "epoch": 0.22, "grad_norm": 
1.122019362597373, "learning_rate": 0.0009128302584179708, "loss": 4.1036, "step": 1882 }, { "epoch": 0.22, "grad_norm": 1.5005720077537035, "learning_rate": 0.0009127254667528988, "loss": 3.8373, "step": 1883 }, { "epoch": 0.22, "grad_norm": 0.9473945309898467, "learning_rate": 0.0009126206181602158, "loss": 3.6843, "step": 1884 }, { "epoch": 0.22, "grad_norm": 1.3548481207852607, "learning_rate": 0.0009125157126543838, "loss": 4.202, "step": 1885 }, { "epoch": 0.22, "grad_norm": 1.0166651471885768, "learning_rate": 0.0009124107502498725, "loss": 3.9889, "step": 1886 }, { "epoch": 0.22, "grad_norm": 1.7668039666335544, "learning_rate": 0.0009123057309611595, "loss": 3.9279, "step": 1887 }, { "epoch": 0.22, "grad_norm": 0.980058500810732, "learning_rate": 0.0009122006548027302, "loss": 3.9683, "step": 1888 }, { "epoch": 0.22, "grad_norm": 0.9689977076282652, "learning_rate": 0.0009120955217890778, "loss": 3.9802, "step": 1889 }, { "epoch": 0.22, "grad_norm": 0.973277142800509, "learning_rate": 0.0009119903319347034, "loss": 3.9129, "step": 1890 }, { "epoch": 0.22, "grad_norm": 0.9594569793196825, "learning_rate": 0.000911885085254116, "loss": 4.189, "step": 1891 }, { "epoch": 0.22, "grad_norm": 1.6391285380068679, "learning_rate": 0.0009117797817618323, "loss": 3.8714, "step": 1892 }, { "epoch": 0.22, "grad_norm": 1.014198040779318, "learning_rate": 0.000911674421472377, "loss": 3.9637, "step": 1893 }, { "epoch": 0.22, "grad_norm": 3.0078316361569493, "learning_rate": 0.0009115690044002824, "loss": 4.2618, "step": 1894 }, { "epoch": 0.22, "grad_norm": 1.1158131813634597, "learning_rate": 0.0009114635305600889, "loss": 3.9545, "step": 1895 }, { "epoch": 0.22, "grad_norm": 0.8979461771866443, "learning_rate": 0.0009113579999663447, "loss": 4.0345, "step": 1896 }, { "epoch": 0.22, "grad_norm": 1.2881976094824341, "learning_rate": 0.0009112524126336054, "loss": 3.8682, "step": 1897 }, { "epoch": 0.22, "grad_norm": 0.9126821621645534, "learning_rate": 0.0009111467685764351, "loss": 3.7492, "step": 1898 }, { "epoch": 0.22, "grad_norm": 1.4502425514143897, "learning_rate": 0.0009110410678094051, "loss": 3.8911, "step": 1899 }, { "epoch": 0.22, "grad_norm": 1.4363409557002336, "learning_rate": 0.0009109353103470951, "loss": 4.0238, "step": 1900 }, { "epoch": 0.22, "grad_norm": 0.9583625228486753, "learning_rate": 0.0009108294962040921, "loss": 4.0189, "step": 1901 }, { "epoch": 0.22, "grad_norm": 1.1136700113392037, "learning_rate": 0.0009107236253949912, "loss": 3.8537, "step": 1902 }, { "epoch": 0.22, "grad_norm": 1.0762500383829918, "learning_rate": 0.0009106176979343955, "loss": 3.9044, "step": 1903 }, { "epoch": 0.22, "grad_norm": 3.1299754536569497, "learning_rate": 0.0009105117138369151, "loss": 4.0407, "step": 1904 }, { "epoch": 0.22, "grad_norm": 0.9185602162672337, "learning_rate": 0.0009104056731171691, "loss": 4.0102, "step": 1905 }, { "epoch": 0.22, "grad_norm": 1.2285193843239044, "learning_rate": 0.0009102995757897834, "loss": 4.1059, "step": 1906 }, { "epoch": 0.22, "grad_norm": 1.8647663925161893, "learning_rate": 0.0009101934218693923, "loss": 4.0156, "step": 1907 }, { "epoch": 0.22, "grad_norm": 1.3581981594046908, "learning_rate": 0.0009100872113706375, "loss": 4.1118, "step": 1908 }, { "epoch": 0.22, "grad_norm": 2.9227147544264027, "learning_rate": 0.0009099809443081691, "loss": 4.1052, "step": 1909 }, { "epoch": 0.22, "grad_norm": 1.0816814028759747, "learning_rate": 0.0009098746206966443, "loss": 4.0135, "step": 1910 }, { "epoch": 0.22, "grad_norm": 1.500542827945388, 
"learning_rate": 0.0009097682405507285, "loss": 3.9417, "step": 1911 }, { "epoch": 0.22, "grad_norm": 0.9133309654984146, "learning_rate": 0.0009096618038850948, "loss": 3.9377, "step": 1912 }, { "epoch": 0.22, "grad_norm": 1.1677322880625147, "learning_rate": 0.0009095553107144241, "loss": 4.1413, "step": 1913 }, { "epoch": 0.22, "grad_norm": 0.8903689855022074, "learning_rate": 0.0009094487610534052, "loss": 3.9264, "step": 1914 }, { "epoch": 0.22, "grad_norm": 2.7150344693632524, "learning_rate": 0.0009093421549167343, "loss": 4.1144, "step": 1915 }, { "epoch": 0.22, "grad_norm": 0.9284814436841481, "learning_rate": 0.0009092354923191161, "loss": 4.1358, "step": 1916 }, { "epoch": 0.22, "grad_norm": 2.506947365419339, "learning_rate": 0.0009091287732752624, "loss": 3.8327, "step": 1917 }, { "epoch": 0.22, "grad_norm": 1.5246431376512875, "learning_rate": 0.0009090219977998933, "loss": 4.0527, "step": 1918 }, { "epoch": 0.22, "grad_norm": 0.8577900895103014, "learning_rate": 0.000908915165907736, "loss": 4.026, "step": 1919 }, { "epoch": 0.22, "grad_norm": 1.0178880032132958, "learning_rate": 0.0009088082776135263, "loss": 4.0653, "step": 1920 }, { "epoch": 0.22, "grad_norm": 2.837869124268149, "learning_rate": 0.0009087013329320073, "loss": 4.0842, "step": 1921 }, { "epoch": 0.22, "grad_norm": 1.3921732837029521, "learning_rate": 0.0009085943318779301, "loss": 4.1027, "step": 1922 }, { "epoch": 0.22, "grad_norm": 0.960438377287129, "learning_rate": 0.0009084872744660532, "loss": 4.029, "step": 1923 }, { "epoch": 0.22, "grad_norm": 0.858529614613576, "learning_rate": 0.0009083801607111433, "loss": 3.7037, "step": 1924 }, { "epoch": 0.22, "grad_norm": 0.9921372716804847, "learning_rate": 0.0009082729906279746, "loss": 4.1679, "step": 1925 }, { "epoch": 0.22, "grad_norm": 1.847791494079528, "learning_rate": 0.0009081657642313292, "loss": 3.8135, "step": 1926 }, { "epoch": 0.22, "grad_norm": 1.0348600404849422, "learning_rate": 0.0009080584815359972, "loss": 4.1087, "step": 1927 }, { "epoch": 0.22, "grad_norm": 1.6655876585320317, "learning_rate": 0.0009079511425567759, "loss": 4.0085, "step": 1928 }, { "epoch": 0.22, "grad_norm": 1.0304365293357196, "learning_rate": 0.0009078437473084706, "loss": 4.1699, "step": 1929 }, { "epoch": 0.22, "grad_norm": 0.9773807003136226, "learning_rate": 0.0009077362958058946, "loss": 3.8927, "step": 1930 }, { "epoch": 0.22, "grad_norm": 1.0911376408367268, "learning_rate": 0.0009076287880638689, "loss": 3.8196, "step": 1931 }, { "epoch": 0.22, "grad_norm": 0.9339933046160588, "learning_rate": 0.0009075212240972218, "loss": 4.0119, "step": 1932 }, { "epoch": 0.22, "grad_norm": 4.4570386276950416, "learning_rate": 0.00090741360392079, "loss": 4.0801, "step": 1933 }, { "epoch": 0.22, "grad_norm": 0.960487877362655, "learning_rate": 0.0009073059275494176, "loss": 4.0568, "step": 1934 }, { "epoch": 0.22, "grad_norm": 1.1303026386660548, "learning_rate": 0.0009071981949979564, "loss": 3.9975, "step": 1935 }, { "epoch": 0.22, "grad_norm": 1.0422481156553995, "learning_rate": 0.000907090406281266, "loss": 4.0863, "step": 1936 }, { "epoch": 0.22, "grad_norm": 1.1633806331467766, "learning_rate": 0.000906982561414214, "loss": 3.8136, "step": 1937 }, { "epoch": 0.22, "grad_norm": 1.1336591247960954, "learning_rate": 0.0009068746604116755, "loss": 3.9622, "step": 1938 }, { "epoch": 0.22, "grad_norm": 0.9897714802063435, "learning_rate": 0.0009067667032885334, "loss": 3.9926, "step": 1939 }, { "epoch": 0.22, "grad_norm": 22.349373935517548, "learning_rate": 
0.0009066586900596781, "loss": 3.9057, "step": 1940 }, { "epoch": 0.22, "grad_norm": 0.9100099729527112, "learning_rate": 0.000906550620740008, "loss": 3.9399, "step": 1941 }, { "epoch": 0.22, "grad_norm": 0.7977788087214611, "learning_rate": 0.0009064424953444296, "loss": 3.8625, "step": 1942 }, { "epoch": 0.22, "grad_norm": 0.8437534010925144, "learning_rate": 0.0009063343138878563, "loss": 4.1781, "step": 1943 }, { "epoch": 0.22, "grad_norm": 1.0031312013794704, "learning_rate": 0.0009062260763852099, "loss": 3.7776, "step": 1944 }, { "epoch": 0.22, "grad_norm": 1.2118103490211898, "learning_rate": 0.0009061177828514198, "loss": 3.9493, "step": 1945 }, { "epoch": 0.22, "grad_norm": 0.8549213927954328, "learning_rate": 0.0009060094333014226, "loss": 3.9345, "step": 1946 }, { "epoch": 0.22, "grad_norm": 0.9750917137362527, "learning_rate": 0.0009059010277501634, "loss": 4.0143, "step": 1947 }, { "epoch": 0.22, "grad_norm": 1.0691236094096144, "learning_rate": 0.0009057925662125946, "loss": 3.9127, "step": 1948 }, { "epoch": 0.22, "grad_norm": 1.5414076412499298, "learning_rate": 0.0009056840487036764, "loss": 3.9279, "step": 1949 }, { "epoch": 0.22, "grad_norm": 0.9997048046068227, "learning_rate": 0.0009055754752383768, "loss": 4.0621, "step": 1950 }, { "epoch": 0.22, "grad_norm": 1.4092696484125373, "learning_rate": 0.0009054668458316713, "loss": 3.886, "step": 1951 }, { "epoch": 0.22, "grad_norm": 1.3024489061406581, "learning_rate": 0.0009053581604985433, "loss": 4.0258, "step": 1952 }, { "epoch": 0.22, "grad_norm": 0.9840677840648923, "learning_rate": 0.000905249419253984, "loss": 3.8981, "step": 1953 }, { "epoch": 0.22, "grad_norm": 0.879542333524627, "learning_rate": 0.0009051406221129919, "loss": 3.8966, "step": 1954 }, { "epoch": 0.22, "grad_norm": 1.4165876080585225, "learning_rate": 0.0009050317690905737, "loss": 3.8378, "step": 1955 }, { "epoch": 0.22, "grad_norm": 1.309475444053492, "learning_rate": 0.0009049228602017437, "loss": 3.8193, "step": 1956 }, { "epoch": 0.22, "grad_norm": 1.037458692721327, "learning_rate": 0.0009048138954615235, "loss": 3.9698, "step": 1957 }, { "epoch": 0.22, "grad_norm": 3.3298482092028023, "learning_rate": 0.0009047048748849429, "loss": 4.1667, "step": 1958 }, { "epoch": 0.22, "grad_norm": 0.9838750214866557, "learning_rate": 0.0009045957984870393, "loss": 4.0608, "step": 1959 }, { "epoch": 0.22, "grad_norm": 0.9767381916375503, "learning_rate": 0.0009044866662828575, "loss": 3.865, "step": 1960 }, { "epoch": 0.22, "grad_norm": 0.954432229490279, "learning_rate": 0.0009043774782874503, "loss": 4.1418, "step": 1961 }, { "epoch": 0.22, "grad_norm": 0.886971388188184, "learning_rate": 0.0009042682345158781, "loss": 4.0844, "step": 1962 }, { "epoch": 0.23, "grad_norm": 0.9367009776560742, "learning_rate": 0.0009041589349832091, "loss": 4.0568, "step": 1963 }, { "epoch": 0.23, "grad_norm": 2.3719473133897466, "learning_rate": 0.000904049579704519, "loss": 3.8868, "step": 1964 }, { "epoch": 0.23, "grad_norm": 1.3456464755553164, "learning_rate": 0.0009039401686948912, "loss": 4.209, "step": 1965 }, { "epoch": 0.23, "grad_norm": 1.2225479259146466, "learning_rate": 0.0009038307019694169, "loss": 3.8316, "step": 1966 }, { "epoch": 0.23, "grad_norm": 1.7925507145650856, "learning_rate": 0.000903721179543195, "loss": 3.8585, "step": 1967 }, { "epoch": 0.23, "grad_norm": 1.6076413995906005, "learning_rate": 0.0009036116014313321, "loss": 3.9579, "step": 1968 }, { "epoch": 0.23, "grad_norm": 0.9781377371280262, "learning_rate": 0.0009035019676489422, 
"loss": 3.8143, "step": 1969 }, { "epoch": 0.23, "grad_norm": 0.9362869013718167, "learning_rate": 0.0009033922782111473, "loss": 3.8252, "step": 1970 }, { "epoch": 0.23, "grad_norm": 1.5549605165514053, "learning_rate": 0.000903282533133077, "loss": 4.2321, "step": 1971 }, { "epoch": 0.23, "grad_norm": 1.1526796222877889, "learning_rate": 0.0009031727324298686, "loss": 3.7528, "step": 1972 }, { "epoch": 0.23, "grad_norm": 1.021906428246368, "learning_rate": 0.0009030628761166668, "loss": 4.0817, "step": 1973 }, { "epoch": 0.23, "grad_norm": 1.2363946924866331, "learning_rate": 0.0009029529642086245, "loss": 4.0143, "step": 1974 }, { "epoch": 0.23, "grad_norm": 1.1279737885339112, "learning_rate": 0.0009028429967209015, "loss": 4.1142, "step": 1975 }, { "epoch": 0.23, "grad_norm": 1.342514208791472, "learning_rate": 0.0009027329736686663, "loss": 4.0271, "step": 1976 }, { "epoch": 0.23, "grad_norm": 1.263216579333727, "learning_rate": 0.000902622895067094, "loss": 3.9282, "step": 1977 }, { "epoch": 0.23, "grad_norm": 1.2627452410363456, "learning_rate": 0.000902512760931368, "loss": 3.9883, "step": 1978 }, { "epoch": 0.23, "grad_norm": 1.3018333427819169, "learning_rate": 0.0009024025712766792, "loss": 4.1194, "step": 1979 }, { "epoch": 0.23, "grad_norm": 5.021171093861207, "learning_rate": 0.0009022923261182264, "loss": 4.058, "step": 1980 }, { "epoch": 0.23, "grad_norm": 1.026491449150726, "learning_rate": 0.0009021820254712153, "loss": 4.0586, "step": 1981 }, { "epoch": 0.23, "grad_norm": 1.0107292658948865, "learning_rate": 0.0009020716693508602, "loss": 3.9737, "step": 1982 }, { "epoch": 0.23, "grad_norm": 1.0481487677941677, "learning_rate": 0.0009019612577723826, "loss": 3.8052, "step": 1983 }, { "epoch": 0.23, "grad_norm": 0.9350433123947498, "learning_rate": 0.0009018507907510114, "loss": 3.9036, "step": 1984 }, { "epoch": 0.23, "grad_norm": 1.673119780916702, "learning_rate": 0.0009017402683019838, "loss": 4.0819, "step": 1985 }, { "epoch": 0.23, "grad_norm": 2.013157845046089, "learning_rate": 0.0009016296904405439, "loss": 3.7729, "step": 1986 }, { "epoch": 0.23, "grad_norm": 0.8864576825445842, "learning_rate": 0.0009015190571819438, "loss": 4.0647, "step": 1987 }, { "epoch": 0.23, "grad_norm": 1.0305201581737629, "learning_rate": 0.0009014083685414437, "loss": 4.0596, "step": 1988 }, { "epoch": 0.23, "grad_norm": 1.6867250802603115, "learning_rate": 0.0009012976245343106, "loss": 3.9806, "step": 1989 }, { "epoch": 0.23, "grad_norm": 1.0524985671359481, "learning_rate": 0.0009011868251758195, "loss": 4.1356, "step": 1990 }, { "epoch": 0.23, "grad_norm": 1.7019113930738348, "learning_rate": 0.0009010759704812533, "loss": 4.1041, "step": 1991 }, { "epoch": 0.23, "grad_norm": 1.0811451068275433, "learning_rate": 0.0009009650604659023, "loss": 3.8564, "step": 1992 }, { "epoch": 0.23, "grad_norm": 1.071176424701606, "learning_rate": 0.0009008540951450641, "loss": 3.9715, "step": 1993 }, { "epoch": 0.23, "grad_norm": 1.060142779467348, "learning_rate": 0.0009007430745340446, "loss": 4.1254, "step": 1994 }, { "epoch": 0.23, "grad_norm": 1.0798965223762322, "learning_rate": 0.0009006319986481567, "loss": 4.1675, "step": 1995 }, { "epoch": 0.23, "grad_norm": 1.0013860737945652, "learning_rate": 0.0009005208675027215, "loss": 3.8626, "step": 1996 }, { "epoch": 0.23, "grad_norm": 1.1111403580199535, "learning_rate": 0.000900409681113067, "loss": 4.0952, "step": 1997 }, { "epoch": 0.23, "grad_norm": 1.8127265094509648, "learning_rate": 0.0009002984394945298, "loss": 4.0653, "step": 1998 
}, { "epoch": 0.23, "grad_norm": 1.0416729202850428, "learning_rate": 0.0009001871426624528, "loss": 3.9621, "step": 1999 }, { "epoch": 0.23, "grad_norm": 1.1739643669369002, "learning_rate": 0.0009000757906321882, "loss": 3.6983, "step": 2000 }, { "epoch": 0.23, "grad_norm": 0.9986766154977568, "learning_rate": 0.0008999643834190941, "loss": 4.2043, "step": 2001 }, { "epoch": 0.23, "grad_norm": 1.0451698744057785, "learning_rate": 0.0008998529210385375, "loss": 4.1062, "step": 2002 }, { "epoch": 0.23, "grad_norm": 1.3161592139632998, "learning_rate": 0.0008997414035058922, "loss": 3.9019, "step": 2003 }, { "epoch": 0.23, "grad_norm": 1.0229727857098334, "learning_rate": 0.0008996298308365403, "loss": 3.8998, "step": 2004 }, { "epoch": 0.23, "grad_norm": 1.669307992393807, "learning_rate": 0.0008995182030458707, "loss": 3.9813, "step": 2005 }, { "epoch": 0.23, "grad_norm": 0.9335905740116022, "learning_rate": 0.0008994065201492804, "loss": 4.1246, "step": 2006 }, { "epoch": 0.23, "grad_norm": 4.266479440097523, "learning_rate": 0.0008992947821621741, "loss": 3.7899, "step": 2007 }, { "epoch": 0.23, "grad_norm": 0.8899877906023115, "learning_rate": 0.0008991829890999639, "loss": 3.9373, "step": 2008 }, { "epoch": 0.23, "grad_norm": 1.0701194326984043, "learning_rate": 0.0008990711409780694, "loss": 4.16, "step": 2009 }, { "epoch": 0.23, "grad_norm": 1.5170868615097475, "learning_rate": 0.000898959237811918, "loss": 4.1377, "step": 2010 }, { "epoch": 0.23, "grad_norm": 0.9832369203956649, "learning_rate": 0.0008988472796169447, "loss": 3.9577, "step": 2011 }, { "epoch": 0.23, "grad_norm": 1.3099289773200549, "learning_rate": 0.0008987352664085919, "loss": 4.1836, "step": 2012 }, { "epoch": 0.23, "grad_norm": 0.9329165460197063, "learning_rate": 0.0008986231982023097, "loss": 3.8749, "step": 2013 }, { "epoch": 0.23, "grad_norm": 1.1023103589529903, "learning_rate": 0.0008985110750135556, "loss": 3.9516, "step": 2014 }, { "epoch": 0.23, "grad_norm": 1.6284055012097072, "learning_rate": 0.0008983988968577951, "loss": 3.8503, "step": 2015 }, { "epoch": 0.23, "grad_norm": 1.0603030360487213, "learning_rate": 0.0008982866637505009, "loss": 4.0674, "step": 2016 }, { "epoch": 0.23, "grad_norm": 1.0105025245170962, "learning_rate": 0.0008981743757071535, "loss": 4.2082, "step": 2017 }, { "epoch": 0.23, "grad_norm": 1.5131782326190812, "learning_rate": 0.000898062032743241, "loss": 4.2443, "step": 2018 }, { "epoch": 0.23, "grad_norm": 1.0290835728870742, "learning_rate": 0.0008979496348742586, "loss": 4.0201, "step": 2019 }, { "epoch": 0.23, "grad_norm": 1.1571711808704603, "learning_rate": 0.0008978371821157098, "loss": 3.9338, "step": 2020 }, { "epoch": 0.23, "grad_norm": 0.9907953763828348, "learning_rate": 0.0008977246744831052, "loss": 3.8589, "step": 2021 }, { "epoch": 0.23, "grad_norm": 0.885733864163197, "learning_rate": 0.0008976121119919631, "loss": 4.023, "step": 2022 }, { "epoch": 0.23, "grad_norm": 0.9645015124655544, "learning_rate": 0.0008974994946578093, "loss": 3.8316, "step": 2023 }, { "epoch": 0.23, "grad_norm": 1.0017782009796345, "learning_rate": 0.0008973868224961772, "loss": 4.2475, "step": 2024 }, { "epoch": 0.23, "grad_norm": 1.7025345383137735, "learning_rate": 0.0008972740955226079, "loss": 4.0555, "step": 2025 }, { "epoch": 0.23, "grad_norm": 1.1881639323148039, "learning_rate": 0.0008971613137526498, "loss": 3.879, "step": 2026 }, { "epoch": 0.23, "grad_norm": 1.890607333892431, "learning_rate": 0.000897048477201859, "loss": 3.9451, "step": 2027 }, { "epoch": 0.23, 
"grad_norm": 1.077133380644248, "learning_rate": 0.0008969355858857994, "loss": 3.7821, "step": 2028 }, { "epoch": 0.23, "grad_norm": 1.076370051509363, "learning_rate": 0.0008968226398200418, "loss": 3.7425, "step": 2029 }, { "epoch": 0.23, "grad_norm": 0.9496653602402219, "learning_rate": 0.0008967096390201652, "loss": 4.0805, "step": 2030 }, { "epoch": 0.23, "grad_norm": 0.7879627273051136, "learning_rate": 0.000896596583501756, "loss": 4.0531, "step": 2031 }, { "epoch": 0.23, "grad_norm": 1.8804967811734177, "learning_rate": 0.0008964834732804078, "loss": 3.939, "step": 2032 }, { "epoch": 0.23, "grad_norm": 0.43020591200231684, "learning_rate": 0.0008963703083717222, "loss": 3.7995, "step": 2033 }, { "epoch": 0.23, "grad_norm": 0.8743740828870921, "learning_rate": 0.000896257088791308, "loss": 3.8186, "step": 2034 }, { "epoch": 0.23, "grad_norm": 2.1177009444777775, "learning_rate": 0.0008961438145547818, "loss": 4.0586, "step": 2035 }, { "epoch": 0.23, "grad_norm": 1.1812492570287723, "learning_rate": 0.0008960304856777675, "loss": 4.0405, "step": 2036 }, { "epoch": 0.23, "grad_norm": 0.9488210322342343, "learning_rate": 0.0008959171021758967, "loss": 4.1962, "step": 2037 }, { "epoch": 0.23, "grad_norm": 0.9091879567056492, "learning_rate": 0.0008958036640648086, "loss": 4.1541, "step": 2038 }, { "epoch": 0.23, "grad_norm": 0.8964234932314026, "learning_rate": 0.0008956901713601499, "loss": 4.0242, "step": 2039 }, { "epoch": 0.23, "grad_norm": 1.1347308385703163, "learning_rate": 0.0008955766240775745, "loss": 3.8195, "step": 2040 }, { "epoch": 0.23, "grad_norm": 0.948568626880418, "learning_rate": 0.000895463022232744, "loss": 3.6306, "step": 2041 }, { "epoch": 0.23, "grad_norm": 1.0338597655885446, "learning_rate": 0.0008953493658413279, "loss": 4.0233, "step": 2042 }, { "epoch": 0.23, "grad_norm": 0.9193716543251867, "learning_rate": 0.0008952356549190028, "loss": 4.0474, "step": 2043 }, { "epoch": 0.23, "grad_norm": 4.10464134860929, "learning_rate": 0.000895121889481453, "loss": 3.8751, "step": 2044 }, { "epoch": 0.23, "grad_norm": 1.0263751916388024, "learning_rate": 0.0008950080695443704, "loss": 3.845, "step": 2045 }, { "epoch": 0.23, "grad_norm": 1.0454657444500637, "learning_rate": 0.000894894195123454, "loss": 4.0189, "step": 2046 }, { "epoch": 0.23, "grad_norm": 1.1070452345392372, "learning_rate": 0.0008947802662344108, "loss": 3.9612, "step": 2047 }, { "epoch": 0.23, "grad_norm": 1.5198351693253676, "learning_rate": 0.0008946662828929551, "loss": 3.7978, "step": 2048 }, { "epoch": 0.23, "grad_norm": 2.6182891831718784, "learning_rate": 0.0008945522451148086, "loss": 3.7381, "step": 2049 }, { "epoch": 0.24, "grad_norm": 1.0727562685891734, "learning_rate": 0.0008944381529157008, "loss": 3.9061, "step": 2050 }, { "epoch": 0.24, "grad_norm": 1.2540916131094588, "learning_rate": 0.0008943240063113685, "loss": 4.1883, "step": 2051 }, { "epoch": 0.24, "grad_norm": 0.9540681442206459, "learning_rate": 0.0008942098053175559, "loss": 3.7779, "step": 2052 }, { "epoch": 0.24, "grad_norm": 0.9473036689883482, "learning_rate": 0.0008940955499500152, "loss": 3.7903, "step": 2053 }, { "epoch": 0.24, "grad_norm": 1.2207060604687816, "learning_rate": 0.0008939812402245053, "loss": 3.9716, "step": 2054 }, { "epoch": 0.24, "grad_norm": 1.1344728414898653, "learning_rate": 0.0008938668761567934, "loss": 4.0965, "step": 2055 }, { "epoch": 0.24, "grad_norm": 1.84939249635288, "learning_rate": 0.0008937524577626537, "loss": 4.2092, "step": 2056 }, { "epoch": 0.24, "grad_norm": 
1.2266450508963505, "learning_rate": 0.0008936379850578679, "loss": 4.0495, "step": 2057 }, { "epoch": 0.24, "grad_norm": 1.05881587990076, "learning_rate": 0.0008935234580582258, "loss": 3.8344, "step": 2058 }, { "epoch": 0.24, "grad_norm": 1.023844708578151, "learning_rate": 0.0008934088767795236, "loss": 4.0504, "step": 2059 }, { "epoch": 0.24, "grad_norm": 1.1090208465298053, "learning_rate": 0.0008932942412375662, "loss": 3.9084, "step": 2060 }, { "epoch": 0.24, "grad_norm": 6.191892484523619, "learning_rate": 0.0008931795514481648, "loss": 3.9365, "step": 2061 }, { "epoch": 0.24, "grad_norm": 2.4551412194521505, "learning_rate": 0.0008930648074271391, "loss": 3.9729, "step": 2062 }, { "epoch": 0.24, "grad_norm": 1.284863538609293, "learning_rate": 0.0008929500091903158, "loss": 4.2326, "step": 2063 }, { "epoch": 0.24, "grad_norm": 0.854189460519284, "learning_rate": 0.0008928351567535289, "loss": 3.9441, "step": 2064 }, { "epoch": 0.24, "grad_norm": 1.917333373188477, "learning_rate": 0.0008927202501326204, "loss": 3.84, "step": 2065 }, { "epoch": 0.24, "grad_norm": 0.869514234440801, "learning_rate": 0.0008926052893434391, "loss": 3.7208, "step": 2066 }, { "epoch": 0.24, "grad_norm": 0.836383379017066, "learning_rate": 0.000892490274401842, "loss": 3.8422, "step": 2067 }, { "epoch": 0.24, "grad_norm": 1.0820368045951274, "learning_rate": 0.0008923752053236931, "loss": 3.9975, "step": 2068 }, { "epoch": 0.24, "grad_norm": 1.0361157437826334, "learning_rate": 0.0008922600821248638, "loss": 3.831, "step": 2069 }, { "epoch": 0.24, "grad_norm": 1.3909792217968016, "learning_rate": 0.0008921449048212336, "loss": 3.9263, "step": 2070 }, { "epoch": 0.24, "grad_norm": 1.3131820894717838, "learning_rate": 0.0008920296734286886, "loss": 3.9647, "step": 2071 }, { "epoch": 0.24, "grad_norm": 1.56882258295789, "learning_rate": 0.000891914387963123, "loss": 3.9528, "step": 2072 }, { "epoch": 0.24, "grad_norm": 0.9352251229233741, "learning_rate": 0.0008917990484404382, "loss": 3.9273, "step": 2073 }, { "epoch": 0.24, "grad_norm": 1.2606150078618286, "learning_rate": 0.0008916836548765427, "loss": 3.9786, "step": 2074 }, { "epoch": 0.24, "grad_norm": 0.9596900336362892, "learning_rate": 0.0008915682072873535, "loss": 4.0439, "step": 2075 }, { "epoch": 0.24, "grad_norm": 2.0043157716567244, "learning_rate": 0.000891452705688794, "loss": 3.7733, "step": 2076 }, { "epoch": 0.24, "grad_norm": 4.497346139485361, "learning_rate": 0.0008913371500967955, "loss": 4.2272, "step": 2077 }, { "epoch": 0.24, "grad_norm": 0.8802089692582836, "learning_rate": 0.0008912215405272967, "loss": 3.8426, "step": 2078 }, { "epoch": 0.24, "grad_norm": 0.9278963451034087, "learning_rate": 0.0008911058769962437, "loss": 3.8375, "step": 2079 }, { "epoch": 0.24, "grad_norm": 1.0273169114334528, "learning_rate": 0.0008909901595195902, "loss": 4.3423, "step": 2080 }, { "epoch": 0.24, "grad_norm": 1.0387837887757536, "learning_rate": 0.0008908743881132972, "loss": 4.0243, "step": 2081 }, { "epoch": 0.24, "grad_norm": 1.5294364160926166, "learning_rate": 0.000890758562793333, "loss": 4.0217, "step": 2082 }, { "epoch": 0.24, "grad_norm": 1.2650750009905296, "learning_rate": 0.0008906426835756736, "loss": 4.0567, "step": 2083 }, { "epoch": 0.24, "grad_norm": 1.1319091939750436, "learning_rate": 0.0008905267504763024, "loss": 3.9189, "step": 2084 }, { "epoch": 0.24, "grad_norm": 0.8479179321824339, "learning_rate": 0.00089041076351121, "loss": 4.088, "step": 2085 }, { "epoch": 0.24, "grad_norm": 1.3316879311532206, "learning_rate": 
0.0008902947226963946, "loss": 3.8532, "step": 2086 }, { "epoch": 0.24, "grad_norm": 1.8342896984550894, "learning_rate": 0.0008901786280478621, "loss": 3.915, "step": 2087 }, { "epoch": 0.24, "grad_norm": 1.0335952357584823, "learning_rate": 0.0008900624795816252, "loss": 3.773, "step": 2088 }, { "epoch": 0.24, "grad_norm": 0.8413572536892177, "learning_rate": 0.0008899462773137047, "loss": 4.2052, "step": 2089 }, { "epoch": 0.24, "grad_norm": 0.9470308165839116, "learning_rate": 0.0008898300212601281, "loss": 3.6706, "step": 2090 }, { "epoch": 0.24, "grad_norm": 1.8280065022152538, "learning_rate": 0.0008897137114369309, "loss": 3.8003, "step": 2091 }, { "epoch": 0.24, "grad_norm": 1.0128672903789542, "learning_rate": 0.000889597347860156, "loss": 4.0327, "step": 2092 }, { "epoch": 0.24, "grad_norm": 1.3286580309260279, "learning_rate": 0.0008894809305458534, "loss": 4.0086, "step": 2093 }, { "epoch": 0.24, "grad_norm": 1.1304607323269529, "learning_rate": 0.0008893644595100803, "loss": 3.9637, "step": 2094 }, { "epoch": 0.24, "grad_norm": 4.278171941310434, "learning_rate": 0.0008892479347689022, "loss": 4.1391, "step": 2095 }, { "epoch": 0.24, "grad_norm": 1.0849616723174316, "learning_rate": 0.0008891313563383911, "loss": 4.0198, "step": 2096 }, { "epoch": 0.24, "grad_norm": 1.4120792157962574, "learning_rate": 0.0008890147242346272, "loss": 4.1693, "step": 2097 }, { "epoch": 0.24, "grad_norm": 1.0237370985853524, "learning_rate": 0.0008888980384736972, "loss": 3.9531, "step": 2098 }, { "epoch": 0.24, "grad_norm": 0.9652459857046282, "learning_rate": 0.0008887812990716957, "loss": 3.9038, "step": 2099 }, { "epoch": 0.24, "grad_norm": 2.011238290629822, "learning_rate": 0.000888664506044725, "loss": 4.0821, "step": 2100 }, { "epoch": 0.24, "grad_norm": 0.951590451161568, "learning_rate": 0.0008885476594088943, "loss": 4.0399, "step": 2101 }, { "epoch": 0.24, "grad_norm": 2.362831987011565, "learning_rate": 0.0008884307591803203, "loss": 3.9662, "step": 2102 }, { "epoch": 0.24, "grad_norm": 0.912387207600123, "learning_rate": 0.0008883138053751274, "loss": 4.0072, "step": 2103 }, { "epoch": 0.24, "grad_norm": 1.0852343600711196, "learning_rate": 0.0008881967980094469, "loss": 4.0407, "step": 2104 }, { "epoch": 0.24, "grad_norm": 0.9904866038257052, "learning_rate": 0.0008880797370994178, "loss": 3.8319, "step": 2105 }, { "epoch": 0.24, "grad_norm": 0.9652605550032656, "learning_rate": 0.0008879626226611865, "loss": 3.8693, "step": 2106 }, { "epoch": 0.24, "grad_norm": 1.1273299876156149, "learning_rate": 0.0008878454547109065, "loss": 3.9072, "step": 2107 }, { "epoch": 0.24, "grad_norm": 0.8506031545784607, "learning_rate": 0.0008877282332647392, "loss": 3.84, "step": 2108 }, { "epoch": 0.24, "grad_norm": 0.9533073453728467, "learning_rate": 0.0008876109583388528, "loss": 3.9703, "step": 2109 }, { "epoch": 0.24, "grad_norm": 1.0337487417718063, "learning_rate": 0.0008874936299494232, "loss": 3.8001, "step": 2110 }, { "epoch": 0.24, "grad_norm": 1.177911687080207, "learning_rate": 0.0008873762481126337, "loss": 4.0315, "step": 2111 }, { "epoch": 0.24, "grad_norm": 1.903743085927802, "learning_rate": 0.0008872588128446749, "loss": 3.959, "step": 2112 }, { "epoch": 0.24, "grad_norm": 1.3318870889268482, "learning_rate": 0.0008871413241617446, "loss": 3.9657, "step": 2113 }, { "epoch": 0.24, "grad_norm": 1.327931266970969, "learning_rate": 0.0008870237820800482, "loss": 3.982, "step": 2114 }, { "epoch": 0.24, "grad_norm": 0.9906699821543572, "learning_rate": 0.0008869061866157985, "loss": 
4.0723, "step": 2115 }, { "epoch": 0.24, "grad_norm": 0.8545125996360657, "learning_rate": 0.0008867885377852153, "loss": 3.8907, "step": 2116 }, { "epoch": 0.24, "grad_norm": 1.0406375454413146, "learning_rate": 0.0008866708356045263, "loss": 3.6435, "step": 2117 }, { "epoch": 0.24, "grad_norm": 1.320718867780087, "learning_rate": 0.0008865530800899661, "loss": 3.8643, "step": 2118 }, { "epoch": 0.24, "grad_norm": 0.9762448206581784, "learning_rate": 0.000886435271257777, "loss": 4.0482, "step": 2119 }, { "epoch": 0.24, "grad_norm": 1.4501595465805373, "learning_rate": 0.0008863174091242083, "loss": 4.025, "step": 2120 }, { "epoch": 0.24, "grad_norm": 0.8142731165605451, "learning_rate": 0.0008861994937055167, "loss": 3.98, "step": 2121 }, { "epoch": 0.24, "grad_norm": 0.8951722146545922, "learning_rate": 0.0008860815250179668, "loss": 3.9186, "step": 2122 }, { "epoch": 0.24, "grad_norm": 0.9394359799082707, "learning_rate": 0.00088596350307783, "loss": 3.9172, "step": 2123 }, { "epoch": 0.24, "grad_norm": 2.4128874025061235, "learning_rate": 0.0008858454279013848, "loss": 4.0986, "step": 2124 }, { "epoch": 0.24, "grad_norm": 1.45818753887707, "learning_rate": 0.0008857272995049178, "loss": 4.0095, "step": 2125 }, { "epoch": 0.24, "grad_norm": 1.9112397360341349, "learning_rate": 0.0008856091179047225, "loss": 4.2679, "step": 2126 }, { "epoch": 0.24, "grad_norm": 1.2059772543188052, "learning_rate": 0.0008854908831170998, "loss": 3.9446, "step": 2127 }, { "epoch": 0.24, "grad_norm": 1.4762559168383609, "learning_rate": 0.0008853725951583578, "loss": 3.9682, "step": 2128 }, { "epoch": 0.24, "grad_norm": 1.1411595938087236, "learning_rate": 0.0008852542540448123, "loss": 3.8284, "step": 2129 }, { "epoch": 0.24, "grad_norm": 1.0158489372019186, "learning_rate": 0.0008851358597927859, "loss": 3.8727, "step": 2130 }, { "epoch": 0.24, "grad_norm": 1.0121907260370762, "learning_rate": 0.0008850174124186091, "loss": 3.8716, "step": 2131 }, { "epoch": 0.24, "grad_norm": 1.0860033465162582, "learning_rate": 0.0008848989119386193, "loss": 4.0511, "step": 2132 }, { "epoch": 0.24, "grad_norm": 1.1605096205445216, "learning_rate": 0.0008847803583691614, "loss": 4.0098, "step": 2133 }, { "epoch": 0.24, "grad_norm": 0.8653397584095837, "learning_rate": 0.0008846617517265878, "loss": 4.0597, "step": 2134 }, { "epoch": 0.24, "grad_norm": 1.0989354195050696, "learning_rate": 0.0008845430920272578, "loss": 3.9111, "step": 2135 }, { "epoch": 0.24, "grad_norm": 1.106177569146193, "learning_rate": 0.0008844243792875384, "loss": 4.2583, "step": 2136 }, { "epoch": 0.25, "grad_norm": 0.9430127753955647, "learning_rate": 0.0008843056135238034, "loss": 3.9278, "step": 2137 }, { "epoch": 0.25, "grad_norm": 1.0103901366776658, "learning_rate": 0.0008841867947524349, "loss": 4.19, "step": 2138 }, { "epoch": 0.25, "grad_norm": 0.8016220832718985, "learning_rate": 0.0008840679229898211, "loss": 3.9543, "step": 2139 }, { "epoch": 0.25, "grad_norm": 1.3529026639719663, "learning_rate": 0.0008839489982523583, "loss": 3.9038, "step": 2140 }, { "epoch": 0.25, "grad_norm": 0.8135895635822074, "learning_rate": 0.00088383002055645, "loss": 4.1462, "step": 2141 }, { "epoch": 0.25, "grad_norm": 1.1214703937640158, "learning_rate": 0.000883710989918507, "loss": 3.9509, "step": 2142 }, { "epoch": 0.25, "grad_norm": 1.6383971039290965, "learning_rate": 0.0008835919063549469, "loss": 3.9357, "step": 2143 }, { "epoch": 0.25, "grad_norm": 0.9565223133745256, "learning_rate": 0.0008834727698821953, "loss": 3.8536, "step": 2144 }, { 
"epoch": 0.25, "grad_norm": 1.6673112977344864, "learning_rate": 0.0008833535805166848, "loss": 3.9877, "step": 2145 }, { "epoch": 0.25, "grad_norm": 2.35208010749955, "learning_rate": 0.0008832343382748551, "loss": 3.9816, "step": 2146 }, { "epoch": 0.25, "grad_norm": 1.2090152698672973, "learning_rate": 0.0008831150431731537, "loss": 3.8487, "step": 2147 }, { "epoch": 0.25, "grad_norm": 1.1306602102181618, "learning_rate": 0.0008829956952280349, "loss": 3.9179, "step": 2148 }, { "epoch": 0.25, "grad_norm": 0.9455926207790951, "learning_rate": 0.0008828762944559605, "loss": 4.1531, "step": 2149 }, { "epoch": 0.25, "grad_norm": 1.1025441859208456, "learning_rate": 0.0008827568408733996, "loss": 4.0575, "step": 2150 }, { "epoch": 0.25, "grad_norm": 1.393344825177647, "learning_rate": 0.0008826373344968285, "loss": 3.8496, "step": 2151 }, { "epoch": 0.25, "grad_norm": 5.854953083823742, "learning_rate": 0.0008825177753427309, "loss": 3.8603, "step": 2152 }, { "epoch": 0.25, "grad_norm": 0.980585894293123, "learning_rate": 0.0008823981634275975, "loss": 3.8725, "step": 2153 }, { "epoch": 0.25, "grad_norm": 0.7940970411263466, "learning_rate": 0.0008822784987679266, "loss": 3.8804, "step": 2154 }, { "epoch": 0.25, "grad_norm": 1.1433133310255805, "learning_rate": 0.0008821587813802237, "loss": 3.9821, "step": 2155 }, { "epoch": 0.25, "grad_norm": 0.8967270268159374, "learning_rate": 0.0008820390112810017, "loss": 3.8382, "step": 2156 }, { "epoch": 0.25, "grad_norm": 1.008532677741829, "learning_rate": 0.0008819191884867803, "loss": 3.7567, "step": 2157 }, { "epoch": 0.25, "grad_norm": 1.1149748715168486, "learning_rate": 0.0008817993130140869, "loss": 3.9639, "step": 2158 }, { "epoch": 0.25, "grad_norm": 1.3913163765937988, "learning_rate": 0.000881679384879456, "loss": 3.9564, "step": 2159 }, { "epoch": 0.25, "grad_norm": 0.8410834388747285, "learning_rate": 0.0008815594040994294, "loss": 3.9065, "step": 2160 }, { "epoch": 0.25, "grad_norm": 0.8157745315545364, "learning_rate": 0.0008814393706905564, "loss": 3.937, "step": 2161 }, { "epoch": 0.25, "grad_norm": 0.9230513150314431, "learning_rate": 0.000881319284669393, "loss": 3.8899, "step": 2162 }, { "epoch": 0.25, "grad_norm": 1.1157960998598506, "learning_rate": 0.0008811991460525027, "loss": 3.7821, "step": 2163 }, { "epoch": 0.25, "grad_norm": 0.8683618819996498, "learning_rate": 0.0008810789548564566, "loss": 3.9197, "step": 2164 }, { "epoch": 0.25, "grad_norm": 0.8414709544237686, "learning_rate": 0.0008809587110978328, "loss": 3.9722, "step": 2165 }, { "epoch": 0.25, "grad_norm": 0.9109897784652776, "learning_rate": 0.0008808384147932165, "loss": 4.0576, "step": 2166 }, { "epoch": 0.25, "grad_norm": 1.3025135833904837, "learning_rate": 0.0008807180659592004, "loss": 4.1156, "step": 2167 }, { "epoch": 0.25, "grad_norm": 1.4942420407295791, "learning_rate": 0.0008805976646123841, "loss": 3.947, "step": 2168 }, { "epoch": 0.25, "grad_norm": 1.1636944306380095, "learning_rate": 0.0008804772107693748, "loss": 3.7147, "step": 2169 }, { "epoch": 0.25, "grad_norm": 1.2032765980902034, "learning_rate": 0.000880356704446787, "loss": 3.8531, "step": 2170 }, { "epoch": 0.25, "grad_norm": 0.9642941362462173, "learning_rate": 0.000880236145661242, "loss": 3.9432, "step": 2171 }, { "epoch": 0.25, "grad_norm": 1.3287299201933072, "learning_rate": 0.0008801155344293686, "loss": 3.7789, "step": 2172 }, { "epoch": 0.25, "grad_norm": 0.9571600592331799, "learning_rate": 0.0008799948707678031, "loss": 3.7206, "step": 2173 }, { "epoch": 0.25, "grad_norm": 
1.1930287755126634, "learning_rate": 0.0008798741546931883, "loss": 3.7978, "step": 2174 }, { "epoch": 0.25, "grad_norm": 1.0978495005436035, "learning_rate": 0.000879753386222175, "loss": 3.9923, "step": 2175 }, { "epoch": 0.25, "grad_norm": 0.972102856805826, "learning_rate": 0.0008796325653714208, "loss": 3.8492, "step": 2176 }, { "epoch": 0.25, "grad_norm": 1.109459985561427, "learning_rate": 0.0008795116921575907, "loss": 4.0082, "step": 2177 }, { "epoch": 0.25, "grad_norm": 0.8750351044200697, "learning_rate": 0.0008793907665973569, "loss": 3.939, "step": 2178 }, { "epoch": 0.25, "grad_norm": 0.975288225222675, "learning_rate": 0.0008792697887073986, "loss": 3.9931, "step": 2179 }, { "epoch": 0.25, "grad_norm": 1.0490283647306515, "learning_rate": 0.0008791487585044025, "loss": 4.0851, "step": 2180 }, { "epoch": 0.25, "grad_norm": 1.4491331274268424, "learning_rate": 0.0008790276760050624, "loss": 3.9673, "step": 2181 }, { "epoch": 0.25, "grad_norm": 0.9484689739688951, "learning_rate": 0.0008789065412260793, "loss": 4.1234, "step": 2182 }, { "epoch": 0.25, "grad_norm": 1.608058338743894, "learning_rate": 0.0008787853541841614, "loss": 4.2119, "step": 2183 }, { "epoch": 0.25, "grad_norm": 0.8339838951938581, "learning_rate": 0.0008786641148960243, "loss": 4.0447, "step": 2184 }, { "epoch": 0.25, "grad_norm": 1.105352784171064, "learning_rate": 0.0008785428233783905, "loss": 4.0469, "step": 2185 }, { "epoch": 0.25, "grad_norm": 1.3699290452645492, "learning_rate": 0.0008784214796479899, "loss": 3.8401, "step": 2186 }, { "epoch": 0.25, "grad_norm": 1.0928156557127404, "learning_rate": 0.0008783000837215596, "loss": 3.8539, "step": 2187 }, { "epoch": 0.25, "grad_norm": 0.8451914564557594, "learning_rate": 0.0008781786356158437, "loss": 3.8902, "step": 2188 }, { "epoch": 0.25, "grad_norm": 1.250896292796464, "learning_rate": 0.0008780571353475939, "loss": 3.8239, "step": 2189 }, { "epoch": 0.25, "grad_norm": 7.188174389539621, "learning_rate": 0.0008779355829335684, "loss": 3.9746, "step": 2190 }, { "epoch": 0.25, "grad_norm": 1.6125645630123637, "learning_rate": 0.0008778139783905337, "loss": 3.9128, "step": 2191 }, { "epoch": 0.25, "grad_norm": 1.1588128924686791, "learning_rate": 0.0008776923217352624, "loss": 3.7537, "step": 2192 }, { "epoch": 0.25, "grad_norm": 1.0558508920756022, "learning_rate": 0.0008775706129845347, "loss": 4.1037, "step": 2193 }, { "epoch": 0.25, "grad_norm": 0.8794816710799214, "learning_rate": 0.0008774488521551381, "loss": 4.0985, "step": 2194 }, { "epoch": 0.25, "grad_norm": 1.2758415995641579, "learning_rate": 0.0008773270392638671, "loss": 4.0272, "step": 2195 }, { "epoch": 0.25, "grad_norm": 1.4100833183672101, "learning_rate": 0.0008772051743275237, "loss": 4.1356, "step": 2196 }, { "epoch": 0.25, "grad_norm": 0.8823991527151862, "learning_rate": 0.0008770832573629166, "loss": 3.9129, "step": 2197 }, { "epoch": 0.25, "grad_norm": 0.8980561379603148, "learning_rate": 0.000876961288386862, "loss": 4.0147, "step": 2198 }, { "epoch": 0.25, "grad_norm": 0.9833206464699501, "learning_rate": 0.0008768392674161833, "loss": 3.6575, "step": 2199 }, { "epoch": 0.25, "grad_norm": 0.8922684652196814, "learning_rate": 0.0008767171944677108, "loss": 3.986, "step": 2200 }, { "epoch": 0.25, "grad_norm": 0.8732975600360913, "learning_rate": 0.0008765950695582821, "loss": 3.8476, "step": 2201 }, { "epoch": 0.25, "grad_norm": 0.9399675776230486, "learning_rate": 0.0008764728927047423, "loss": 4.1031, "step": 2202 }, { "epoch": 0.25, "grad_norm": 0.8592025405544144, 
"learning_rate": 0.0008763506639239432, "loss": 3.9165, "step": 2203 }, { "epoch": 0.25, "grad_norm": 0.9661237222930807, "learning_rate": 0.0008762283832327436, "loss": 4.0161, "step": 2204 }, { "epoch": 0.25, "grad_norm": 0.9228034294154707, "learning_rate": 0.0008761060506480103, "loss": 4.1599, "step": 2205 }, { "epoch": 0.25, "grad_norm": 1.2160951077094662, "learning_rate": 0.0008759836661866165, "loss": 3.666, "step": 2206 }, { "epoch": 0.25, "grad_norm": 0.8740025443113125, "learning_rate": 0.0008758612298654429, "loss": 4.0106, "step": 2207 }, { "epoch": 0.25, "grad_norm": 1.2622319434596756, "learning_rate": 0.0008757387417013772, "loss": 3.7535, "step": 2208 }, { "epoch": 0.25, "grad_norm": 1.185069122859782, "learning_rate": 0.0008756162017113144, "loss": 3.8854, "step": 2209 }, { "epoch": 0.25, "grad_norm": 0.9180279660712385, "learning_rate": 0.0008754936099121565, "loss": 4.0234, "step": 2210 }, { "epoch": 0.25, "grad_norm": 0.9107832116282002, "learning_rate": 0.0008753709663208125, "loss": 4.1241, "step": 2211 }, { "epoch": 0.25, "grad_norm": 1.2350656983555386, "learning_rate": 0.0008752482709541989, "loss": 3.9766, "step": 2212 }, { "epoch": 0.25, "grad_norm": 1.2525251043739827, "learning_rate": 0.0008751255238292392, "loss": 3.8812, "step": 2213 }, { "epoch": 0.25, "grad_norm": 1.0022312264179247, "learning_rate": 0.0008750027249628643, "loss": 4.1762, "step": 2214 }, { "epoch": 0.25, "grad_norm": 1.2226033561541223, "learning_rate": 0.0008748798743720115, "loss": 3.9444, "step": 2215 }, { "epoch": 0.25, "grad_norm": 0.9155371183563021, "learning_rate": 0.0008747569720736257, "loss": 4.0348, "step": 2216 }, { "epoch": 0.25, "grad_norm": 1.5761318906039694, "learning_rate": 0.0008746340180846595, "loss": 4.093, "step": 2217 }, { "epoch": 0.25, "grad_norm": 0.882751487423579, "learning_rate": 0.0008745110124220714, "loss": 3.9665, "step": 2218 }, { "epoch": 0.25, "grad_norm": 0.9751049390270244, "learning_rate": 0.000874387955102828, "loss": 4.1791, "step": 2219 }, { "epoch": 0.25, "grad_norm": 0.7817362804334659, "learning_rate": 0.0008742648461439028, "loss": 3.7654, "step": 2220 }, { "epoch": 0.25, "grad_norm": 1.2578195328587212, "learning_rate": 0.0008741416855622762, "loss": 3.9854, "step": 2221 }, { "epoch": 0.25, "grad_norm": 0.8525448939710215, "learning_rate": 0.0008740184733749357, "loss": 3.9989, "step": 2222 }, { "epoch": 0.25, "grad_norm": 0.9043568746530648, "learning_rate": 0.0008738952095988763, "loss": 4.1763, "step": 2223 }, { "epoch": 0.26, "grad_norm": 0.872149579199396, "learning_rate": 0.0008737718942510999, "loss": 4.0003, "step": 2224 }, { "epoch": 0.26, "grad_norm": 0.8777639260082888, "learning_rate": 0.0008736485273486155, "loss": 4.0726, "step": 2225 }, { "epoch": 0.26, "grad_norm": 0.8084549795123298, "learning_rate": 0.000873525108908439, "loss": 3.9261, "step": 2226 }, { "epoch": 0.26, "grad_norm": 0.856706794088904, "learning_rate": 0.0008734016389475938, "loss": 4.0497, "step": 2227 }, { "epoch": 0.26, "grad_norm": 2.214865945719517, "learning_rate": 0.0008732781174831103, "loss": 3.7595, "step": 2228 }, { "epoch": 0.26, "grad_norm": 0.9190914614178466, "learning_rate": 0.0008731545445320257, "loss": 4.0537, "step": 2229 }, { "epoch": 0.26, "grad_norm": 0.9665223469399523, "learning_rate": 0.0008730309201113847, "loss": 4.1073, "step": 2230 }, { "epoch": 0.26, "grad_norm": 0.9803214774678155, "learning_rate": 0.0008729072442382387, "loss": 4.0769, "step": 2231 }, { "epoch": 0.26, "grad_norm": 0.8315119413418715, "learning_rate": 
0.0008727835169296469, "loss": 3.7484, "step": 2232 }, { "epoch": 0.26, "grad_norm": 15.654232523656251, "learning_rate": 0.0008726597382026747, "loss": 3.9204, "step": 2233 }, { "epoch": 0.26, "grad_norm": 0.916582886774442, "learning_rate": 0.0008725359080743951, "loss": 3.9736, "step": 2234 }, { "epoch": 0.26, "grad_norm": 0.7899450433922629, "learning_rate": 0.0008724120265618882, "loss": 3.8648, "step": 2235 }, { "epoch": 0.26, "grad_norm": 0.8215385030958859, "learning_rate": 0.0008722880936822411, "loss": 3.8936, "step": 2236 }, { "epoch": 0.26, "grad_norm": 0.7929997658747323, "learning_rate": 0.0008721641094525481, "loss": 3.9588, "step": 2237 }, { "epoch": 0.26, "grad_norm": 0.8408837816571936, "learning_rate": 0.0008720400738899101, "loss": 4.1097, "step": 2238 }, { "epoch": 0.26, "grad_norm": 0.90320705176028, "learning_rate": 0.0008719159870114356, "loss": 4.0359, "step": 2239 }, { "epoch": 0.26, "grad_norm": 1.0833230498833812, "learning_rate": 0.00087179184883424, "loss": 4.0967, "step": 2240 }, { "epoch": 0.26, "grad_norm": 0.9808588127492937, "learning_rate": 0.000871667659375446, "loss": 4.2344, "step": 2241 }, { "epoch": 0.26, "grad_norm": 1.304785842025187, "learning_rate": 0.0008715434186521831, "loss": 3.9667, "step": 2242 }, { "epoch": 0.26, "grad_norm": 1.224636112681155, "learning_rate": 0.0008714191266815877, "loss": 3.9017, "step": 2243 }, { "epoch": 0.26, "grad_norm": 1.0153208166432204, "learning_rate": 0.0008712947834808036, "loss": 3.9926, "step": 2244 }, { "epoch": 0.26, "grad_norm": 0.9099223991082382, "learning_rate": 0.0008711703890669818, "loss": 3.7267, "step": 2245 }, { "epoch": 0.26, "grad_norm": 1.4379620231078674, "learning_rate": 0.0008710459434572799, "loss": 3.9391, "step": 2246 }, { "epoch": 0.26, "grad_norm": 1.4272088377714736, "learning_rate": 0.0008709214466688629, "loss": 3.7905, "step": 2247 }, { "epoch": 0.26, "grad_norm": 0.9632218597396647, "learning_rate": 0.0008707968987189028, "loss": 3.9601, "step": 2248 }, { "epoch": 0.26, "grad_norm": 3.0130135194730396, "learning_rate": 0.0008706722996245784, "loss": 3.932, "step": 2249 }, { "epoch": 0.26, "grad_norm": 0.7786606357572302, "learning_rate": 0.0008705476494030762, "loss": 4.0162, "step": 2250 }, { "epoch": 0.26, "grad_norm": 1.065165758529379, "learning_rate": 0.0008704229480715887, "loss": 4.0203, "step": 2251 }, { "epoch": 0.26, "grad_norm": 0.9532642100658648, "learning_rate": 0.0008702981956473166, "loss": 3.9602, "step": 2252 }, { "epoch": 0.26, "grad_norm": 1.1070271178356093, "learning_rate": 0.0008701733921474671, "loss": 3.9558, "step": 2253 }, { "epoch": 0.26, "grad_norm": 0.9010833587313534, "learning_rate": 0.0008700485375892539, "loss": 4.2928, "step": 2254 }, { "epoch": 0.26, "grad_norm": 1.068048644214126, "learning_rate": 0.000869923631989899, "loss": 4.0186, "step": 2255 }, { "epoch": 0.26, "grad_norm": 1.8800153732187113, "learning_rate": 0.0008697986753666304, "loss": 3.9631, "step": 2256 }, { "epoch": 0.26, "grad_norm": 1.1207261183560644, "learning_rate": 0.0008696736677366834, "loss": 4.0456, "step": 2257 }, { "epoch": 0.26, "grad_norm": 0.8449727741032496, "learning_rate": 0.0008695486091173008, "loss": 4.0715, "step": 2258 }, { "epoch": 0.26, "grad_norm": 0.8947834470392755, "learning_rate": 0.0008694234995257318, "loss": 4.1547, "step": 2259 }, { "epoch": 0.26, "grad_norm": 1.2405092184886062, "learning_rate": 0.0008692983389792326, "loss": 4.2263, "step": 2260 }, { "epoch": 0.26, "grad_norm": 0.7155604785209384, "learning_rate": 0.0008691731274950671, 
"loss": 3.8411, "step": 2261 }, { "epoch": 0.26, "grad_norm": 1.233848653567123, "learning_rate": 0.0008690478650905059, "loss": 3.8386, "step": 2262 }, { "epoch": 0.26, "grad_norm": 2.025376352144121, "learning_rate": 0.0008689225517828263, "loss": 3.9018, "step": 2263 }, { "epoch": 0.26, "grad_norm": 0.8340903435424765, "learning_rate": 0.000868797187589313, "loss": 4.1776, "step": 2264 }, { "epoch": 0.26, "grad_norm": 0.9172241956422424, "learning_rate": 0.0008686717725272577, "loss": 4.0436, "step": 2265 }, { "epoch": 0.26, "grad_norm": 0.8865095359134835, "learning_rate": 0.0008685463066139587, "loss": 4.2418, "step": 2266 }, { "epoch": 0.26, "grad_norm": 0.7401890010350518, "learning_rate": 0.000868420789866722, "loss": 4.0356, "step": 2267 }, { "epoch": 0.26, "grad_norm": 1.3929086444151173, "learning_rate": 0.00086829522230286, "loss": 3.9131, "step": 2268 }, { "epoch": 0.26, "grad_norm": 2.584644498043084, "learning_rate": 0.0008681696039396924, "loss": 3.8776, "step": 2269 }, { "epoch": 0.26, "grad_norm": 0.9277478372594873, "learning_rate": 0.0008680439347945459, "loss": 4.0199, "step": 2270 }, { "epoch": 0.26, "grad_norm": 0.936168692550438, "learning_rate": 0.0008679182148847542, "loss": 4.184, "step": 2271 }, { "epoch": 0.26, "grad_norm": 0.8402365143215479, "learning_rate": 0.000867792444227658, "loss": 4.0714, "step": 2272 }, { "epoch": 0.26, "grad_norm": 1.0371185714185387, "learning_rate": 0.0008676666228406047, "loss": 3.8073, "step": 2273 }, { "epoch": 0.26, "grad_norm": 1.0874134887859563, "learning_rate": 0.0008675407507409492, "loss": 3.9982, "step": 2274 }, { "epoch": 0.26, "grad_norm": 1.2729194100667194, "learning_rate": 0.0008674148279460532, "loss": 3.9878, "step": 2275 }, { "epoch": 0.26, "grad_norm": 0.8879034535438304, "learning_rate": 0.0008672888544732851, "loss": 4.1824, "step": 2276 }, { "epoch": 0.26, "grad_norm": 1.2411873377903817, "learning_rate": 0.0008671628303400208, "loss": 4.3929, "step": 2277 }, { "epoch": 0.26, "grad_norm": 0.743813455062956, "learning_rate": 0.0008670367555636427, "loss": 3.9572, "step": 2278 }, { "epoch": 0.26, "grad_norm": 3.098198142229885, "learning_rate": 0.0008669106301615406, "loss": 3.7639, "step": 2279 }, { "epoch": 0.26, "grad_norm": 0.7963196281586753, "learning_rate": 0.0008667844541511109, "loss": 4.0079, "step": 2280 }, { "epoch": 0.26, "grad_norm": 2.473167386617907, "learning_rate": 0.0008666582275497575, "loss": 3.9405, "step": 2281 }, { "epoch": 0.26, "grad_norm": 0.9019680579547602, "learning_rate": 0.0008665319503748908, "loss": 4.0149, "step": 2282 }, { "epoch": 0.26, "grad_norm": 0.8675072665279613, "learning_rate": 0.0008664056226439281, "loss": 3.9511, "step": 2283 }, { "epoch": 0.26, "grad_norm": 0.8431210339972163, "learning_rate": 0.0008662792443742942, "loss": 3.9319, "step": 2284 }, { "epoch": 0.26, "grad_norm": 1.1023915387133725, "learning_rate": 0.0008661528155834203, "loss": 4.0171, "step": 2285 }, { "epoch": 0.26, "grad_norm": 0.8110344302300158, "learning_rate": 0.0008660263362887451, "loss": 3.8456, "step": 2286 }, { "epoch": 0.26, "grad_norm": 1.2040591297580245, "learning_rate": 0.000865899806507714, "loss": 4.0609, "step": 2287 }, { "epoch": 0.26, "grad_norm": 0.7885163827786756, "learning_rate": 0.0008657732262577791, "loss": 3.8487, "step": 2288 }, { "epoch": 0.26, "grad_norm": 0.8646045299042666, "learning_rate": 0.0008656465955564, "loss": 4.0526, "step": 2289 }, { "epoch": 0.26, "grad_norm": 0.836243134779185, "learning_rate": 0.0008655199144210428, "loss": 3.9338, "step": 2290 }, { 
"epoch": 0.26, "grad_norm": 0.9318064821485097, "learning_rate": 0.0008653931828691808, "loss": 4.1383, "step": 2291 }, { "epoch": 0.26, "grad_norm": 0.8669188891506721, "learning_rate": 0.0008652664009182945, "loss": 3.7397, "step": 2292 }, { "epoch": 0.26, "grad_norm": 2.229458703505447, "learning_rate": 0.0008651395685858708, "loss": 3.915, "step": 2293 }, { "epoch": 0.26, "grad_norm": 1.0246580727792376, "learning_rate": 0.0008650126858894035, "loss": 4.0846, "step": 2294 }, { "epoch": 0.26, "grad_norm": 1.2127325464064838, "learning_rate": 0.0008648857528463943, "loss": 3.6697, "step": 2295 }, { "epoch": 0.26, "grad_norm": 1.7359829533973816, "learning_rate": 0.0008647587694743506, "loss": 4.02, "step": 2296 }, { "epoch": 0.26, "grad_norm": 1.0362006564987034, "learning_rate": 0.0008646317357907877, "loss": 4.1268, "step": 2297 }, { "epoch": 0.26, "grad_norm": 0.8124200817805031, "learning_rate": 0.0008645046518132273, "loss": 4.0848, "step": 2298 }, { "epoch": 0.26, "grad_norm": 1.0634050412161502, "learning_rate": 0.0008643775175591983, "loss": 3.8077, "step": 2299 }, { "epoch": 0.26, "grad_norm": 1.217665118506841, "learning_rate": 0.0008642503330462364, "loss": 4.0004, "step": 2300 }, { "epoch": 0.26, "grad_norm": 0.856341376469498, "learning_rate": 0.0008641230982918844, "loss": 3.8989, "step": 2301 }, { "epoch": 0.26, "grad_norm": 1.2119826410751318, "learning_rate": 0.0008639958133136918, "loss": 3.8684, "step": 2302 }, { "epoch": 0.26, "grad_norm": 0.9366950550189749, "learning_rate": 0.000863868478129215, "loss": 3.8859, "step": 2303 }, { "epoch": 0.26, "grad_norm": 1.078632825294467, "learning_rate": 0.0008637410927560176, "loss": 3.789, "step": 2304 }, { "epoch": 0.26, "grad_norm": 1.2018344037679587, "learning_rate": 0.0008636136572116702, "loss": 4.0414, "step": 2305 }, { "epoch": 0.26, "grad_norm": 1.054857034598852, "learning_rate": 0.0008634861715137497, "loss": 4.3906, "step": 2306 }, { "epoch": 0.26, "grad_norm": 2.0636558822373994, "learning_rate": 0.0008633586356798406, "loss": 3.8945, "step": 2307 }, { "epoch": 0.26, "grad_norm": 1.4250235704661274, "learning_rate": 0.0008632310497275339, "loss": 4.2035, "step": 2308 }, { "epoch": 0.26, "grad_norm": 0.7642267677600353, "learning_rate": 0.0008631034136744278, "loss": 4.1318, "step": 2309 }, { "epoch": 0.26, "grad_norm": 1.83071167530359, "learning_rate": 0.0008629757275381272, "loss": 4.0662, "step": 2310 }, { "epoch": 0.26, "grad_norm": 0.9458579608211676, "learning_rate": 0.0008628479913362438, "loss": 3.8669, "step": 2311 }, { "epoch": 0.27, "grad_norm": 0.9823478081853559, "learning_rate": 0.0008627202050863966, "loss": 3.8454, "step": 2312 }, { "epoch": 0.27, "grad_norm": 0.9532519700922556, "learning_rate": 0.0008625923688062112, "loss": 3.8482, "step": 2313 }, { "epoch": 0.27, "grad_norm": 2.0185844014032712, "learning_rate": 0.0008624644825133201, "loss": 4.0369, "step": 2314 }, { "epoch": 0.27, "grad_norm": 0.7885495683199164, "learning_rate": 0.0008623365462253627, "loss": 3.9894, "step": 2315 }, { "epoch": 0.27, "grad_norm": 0.9917051398344386, "learning_rate": 0.0008622085599599857, "loss": 3.9779, "step": 2316 }, { "epoch": 0.27, "grad_norm": 0.813221775847479, "learning_rate": 0.0008620805237348422, "loss": 4.0815, "step": 2317 }, { "epoch": 0.27, "grad_norm": 1.066602141936342, "learning_rate": 0.0008619524375675922, "loss": 3.858, "step": 2318 }, { "epoch": 0.27, "grad_norm": 1.0384000679964036, "learning_rate": 0.0008618243014759028, "loss": 3.9809, "step": 2319 }, { "epoch": 0.27, "grad_norm": 
0.9916335065180159, "learning_rate": 0.0008616961154774483, "loss": 4.0376, "step": 2320 }, { "epoch": 0.27, "grad_norm": 0.8436903553756063, "learning_rate": 0.0008615678795899091, "loss": 3.9075, "step": 2321 }, { "epoch": 0.27, "grad_norm": 0.7528995923143742, "learning_rate": 0.0008614395938309729, "loss": 3.9278, "step": 2322 }, { "epoch": 0.27, "grad_norm": 0.8948726339121946, "learning_rate": 0.0008613112582183345, "loss": 3.8493, "step": 2323 }, { "epoch": 0.27, "grad_norm": 0.8344080565445902, "learning_rate": 0.0008611828727696953, "loss": 4.0515, "step": 2324 }, { "epoch": 0.27, "grad_norm": 4.211528689063667, "learning_rate": 0.0008610544375027636, "loss": 4.1242, "step": 2325 }, { "epoch": 0.27, "grad_norm": 0.8972799344599799, "learning_rate": 0.0008609259524352544, "loss": 4.2552, "step": 2326 }, { "epoch": 0.27, "grad_norm": 3.518595809481156, "learning_rate": 0.00086079741758489, "loss": 3.9543, "step": 2327 }, { "epoch": 0.27, "grad_norm": 0.7446378781603242, "learning_rate": 0.0008606688329693994, "loss": 3.8925, "step": 2328 }, { "epoch": 0.27, "grad_norm": 0.7210541640183263, "learning_rate": 0.000860540198606518, "loss": 4.0561, "step": 2329 }, { "epoch": 0.27, "grad_norm": 0.8088778443747705, "learning_rate": 0.0008604115145139889, "loss": 3.8757, "step": 2330 }, { "epoch": 0.27, "grad_norm": 0.9338938983091791, "learning_rate": 0.0008602827807095614, "loss": 3.8671, "step": 2331 }, { "epoch": 0.27, "grad_norm": 0.7966328265302801, "learning_rate": 0.000860153997210992, "loss": 4.1208, "step": 2332 }, { "epoch": 0.27, "grad_norm": 0.8299627401811752, "learning_rate": 0.0008600251640360438, "loss": 3.9585, "step": 2333 }, { "epoch": 0.27, "grad_norm": 0.8636705474769473, "learning_rate": 0.0008598962812024868, "loss": 4.0285, "step": 2334 }, { "epoch": 0.27, "grad_norm": 1.1430596406916276, "learning_rate": 0.0008597673487280983, "loss": 3.811, "step": 2335 }, { "epoch": 0.27, "grad_norm": 1.1397790852980074, "learning_rate": 0.0008596383666306616, "loss": 3.754, "step": 2336 }, { "epoch": 0.27, "grad_norm": 0.8956110940339987, "learning_rate": 0.0008595093349279677, "loss": 4.0023, "step": 2337 }, { "epoch": 0.27, "grad_norm": 0.8922896522320081, "learning_rate": 0.000859380253637814, "loss": 4.0869, "step": 2338 }, { "epoch": 0.27, "grad_norm": 1.3105546552788425, "learning_rate": 0.0008592511227780045, "loss": 3.8349, "step": 2339 }, { "epoch": 0.27, "grad_norm": 0.9447062980982079, "learning_rate": 0.0008591219423663506, "loss": 4.1107, "step": 2340 }, { "epoch": 0.27, "grad_norm": 1.5662664543497387, "learning_rate": 0.0008589927124206702, "loss": 4.0338, "step": 2341 }, { "epoch": 0.27, "grad_norm": 7.089837456094443, "learning_rate": 0.0008588634329587884, "loss": 3.6357, "step": 2342 }, { "epoch": 0.27, "grad_norm": 0.87723158297215, "learning_rate": 0.0008587341039985363, "loss": 4.2388, "step": 2343 }, { "epoch": 0.27, "grad_norm": 1.1654864399883182, "learning_rate": 0.0008586047255577527, "loss": 3.9486, "step": 2344 }, { "epoch": 0.27, "grad_norm": 2.940298226490436, "learning_rate": 0.000858475297654283, "loss": 3.9789, "step": 2345 }, { "epoch": 0.27, "grad_norm": 2.0937603359552845, "learning_rate": 0.0008583458203059791, "loss": 4.0611, "step": 2346 }, { "epoch": 0.27, "grad_norm": 1.1212960162700234, "learning_rate": 0.0008582162935306998, "loss": 4.1465, "step": 2347 }, { "epoch": 0.27, "grad_norm": 0.9905823872930529, "learning_rate": 0.0008580867173463112, "loss": 4.0083, "step": 2348 }, { "epoch": 0.27, "grad_norm": 0.9503352119113785, 
"learning_rate": 0.0008579570917706857, "loss": 3.9488, "step": 2349 }, { "epoch": 0.27, "grad_norm": 0.7780988316999806, "learning_rate": 0.0008578274168217026, "loss": 3.824, "step": 2350 }, { "epoch": 0.27, "grad_norm": 1.2052221895063258, "learning_rate": 0.0008576976925172481, "loss": 3.8507, "step": 2351 }, { "epoch": 0.27, "grad_norm": 1.437785735418753, "learning_rate": 0.0008575679188752154, "loss": 4.0437, "step": 2352 }, { "epoch": 0.27, "grad_norm": 1.8177488976281853, "learning_rate": 0.0008574380959135042, "loss": 4.1244, "step": 2353 }, { "epoch": 0.27, "grad_norm": 1.8079703217847076, "learning_rate": 0.0008573082236500209, "loss": 3.8013, "step": 2354 }, { "epoch": 0.27, "grad_norm": 1.428752426025728, "learning_rate": 0.0008571783021026791, "loss": 4.0086, "step": 2355 }, { "epoch": 0.27, "grad_norm": 0.8978124717388525, "learning_rate": 0.000857048331289399, "loss": 3.946, "step": 2356 }, { "epoch": 0.27, "grad_norm": 1.4297854223823323, "learning_rate": 0.0008569183112281075, "loss": 4.0231, "step": 2357 }, { "epoch": 0.27, "grad_norm": 0.8166821543176173, "learning_rate": 0.0008567882419367386, "loss": 3.7117, "step": 2358 }, { "epoch": 0.27, "grad_norm": 2.312615425732293, "learning_rate": 0.0008566581234332327, "loss": 3.8147, "step": 2359 }, { "epoch": 0.27, "grad_norm": 0.9761729700983796, "learning_rate": 0.000856527955735537, "loss": 3.7762, "step": 2360 }, { "epoch": 0.27, "grad_norm": 1.03125532656354, "learning_rate": 0.000856397738861606, "loss": 4.1011, "step": 2361 }, { "epoch": 0.27, "grad_norm": 2.157062419850866, "learning_rate": 0.0008562674728294004, "loss": 4.0426, "step": 2362 }, { "epoch": 0.27, "grad_norm": 0.8481128884502508, "learning_rate": 0.0008561371576568881, "loss": 3.9313, "step": 2363 }, { "epoch": 0.27, "grad_norm": 1.205283312333535, "learning_rate": 0.0008560067933620435, "loss": 3.7673, "step": 2364 }, { "epoch": 0.27, "grad_norm": 0.9388705811889348, "learning_rate": 0.0008558763799628477, "loss": 3.8873, "step": 2365 }, { "epoch": 0.27, "grad_norm": 1.4361662675835365, "learning_rate": 0.000855745917477289, "loss": 3.8666, "step": 2366 }, { "epoch": 0.27, "grad_norm": 1.3931518172580624, "learning_rate": 0.0008556154059233622, "loss": 3.9649, "step": 2367 }, { "epoch": 0.27, "grad_norm": 0.8790689323308412, "learning_rate": 0.0008554848453190686, "loss": 3.9988, "step": 2368 }, { "epoch": 0.27, "grad_norm": 0.9077582168872002, "learning_rate": 0.0008553542356824168, "loss": 3.8141, "step": 2369 }, { "epoch": 0.27, "grad_norm": 1.0808846166702613, "learning_rate": 0.0008552235770314221, "loss": 3.9715, "step": 2370 }, { "epoch": 0.27, "grad_norm": 0.9374076799815033, "learning_rate": 0.0008550928693841058, "loss": 4.0946, "step": 2371 }, { "epoch": 0.27, "grad_norm": 1.244667541894412, "learning_rate": 0.0008549621127584971, "loss": 4.0222, "step": 2372 }, { "epoch": 0.27, "grad_norm": 0.9220180581708312, "learning_rate": 0.000854831307172631, "loss": 3.9461, "step": 2373 }, { "epoch": 0.27, "grad_norm": 0.8950489364658922, "learning_rate": 0.0008547004526445499, "loss": 3.9416, "step": 2374 }, { "epoch": 0.27, "grad_norm": 0.8946336455049452, "learning_rate": 0.0008545695491923024, "loss": 4.1061, "step": 2375 }, { "epoch": 0.27, "grad_norm": 0.8905888344587501, "learning_rate": 0.0008544385968339445, "loss": 4.0681, "step": 2376 }, { "epoch": 0.27, "grad_norm": 1.021490276058496, "learning_rate": 0.0008543075955875382, "loss": 3.7954, "step": 2377 }, { "epoch": 0.27, "grad_norm": 0.8684064495513885, "learning_rate": 
0.0008541765454711527, "loss": 3.9764, "step": 2378 }, { "epoch": 0.27, "grad_norm": 1.193306712644163, "learning_rate": 0.0008540454465028643, "loss": 3.9195, "step": 2379 }, { "epoch": 0.27, "grad_norm": 0.9209645051689618, "learning_rate": 0.0008539142987007551, "loss": 4.1639, "step": 2380 }, { "epoch": 0.27, "grad_norm": 1.1076863936689587, "learning_rate": 0.0008537831020829147, "loss": 3.7727, "step": 2381 }, { "epoch": 0.27, "grad_norm": 1.174816944753777, "learning_rate": 0.0008536518566674389, "loss": 4.1595, "step": 2382 }, { "epoch": 0.27, "grad_norm": 0.7925995192663658, "learning_rate": 0.0008535205624724309, "loss": 3.9861, "step": 2383 }, { "epoch": 0.27, "grad_norm": 2.4460656490752903, "learning_rate": 0.0008533892195159999, "loss": 3.9659, "step": 2384 }, { "epoch": 0.27, "grad_norm": 0.8170308608980023, "learning_rate": 0.0008532578278162624, "loss": 3.8945, "step": 2385 }, { "epoch": 0.27, "grad_norm": 0.8914970572660907, "learning_rate": 0.0008531263873913411, "loss": 3.9662, "step": 2386 }, { "epoch": 0.27, "grad_norm": 0.7924959856420668, "learning_rate": 0.0008529948982593658, "loss": 3.9403, "step": 2387 }, { "epoch": 0.27, "grad_norm": 1.4274439646611874, "learning_rate": 0.0008528633604384733, "loss": 3.8603, "step": 2388 }, { "epoch": 0.27, "grad_norm": 0.8647131574332183, "learning_rate": 0.0008527317739468061, "loss": 3.8007, "step": 2389 }, { "epoch": 0.27, "grad_norm": 1.1501723387430487, "learning_rate": 0.0008526001388025145, "loss": 4.0012, "step": 2390 }, { "epoch": 0.27, "grad_norm": 0.9410878279862962, "learning_rate": 0.0008524684550237549, "loss": 4.1803, "step": 2391 }, { "epoch": 0.27, "grad_norm": 0.8905419400169637, "learning_rate": 0.0008523367226286907, "loss": 3.6822, "step": 2392 }, { "epoch": 0.27, "grad_norm": 0.9525709149060198, "learning_rate": 0.0008522049416354915, "loss": 4.0398, "step": 2393 }, { "epoch": 0.27, "grad_norm": 1.1138407519880187, "learning_rate": 0.0008520731120623344, "loss": 4.0924, "step": 2394 }, { "epoch": 0.27, "grad_norm": 0.9314436634339874, "learning_rate": 0.0008519412339274027, "loss": 3.9124, "step": 2395 }, { "epoch": 0.27, "grad_norm": 4.622007815584016, "learning_rate": 0.0008518093072488863, "loss": 4.0434, "step": 2396 }, { "epoch": 0.27, "grad_norm": 1.8832561502133738, "learning_rate": 0.000851677332044982, "loss": 3.9738, "step": 2397 }, { "epoch": 0.27, "grad_norm": 1.1340076202410374, "learning_rate": 0.0008515453083338935, "loss": 4.3007, "step": 2398 }, { "epoch": 0.28, "grad_norm": 0.7978313662030689, "learning_rate": 0.0008514132361338306, "loss": 3.9985, "step": 2399 }, { "epoch": 0.28, "grad_norm": 0.9060138175473127, "learning_rate": 0.0008512811154630104, "loss": 3.9458, "step": 2400 }, { "epoch": 0.28, "grad_norm": 1.7252603690365107, "learning_rate": 0.0008511489463396563, "loss": 3.825, "step": 2401 }, { "epoch": 0.28, "grad_norm": 0.9307272743958461, "learning_rate": 0.0008510167287819986, "loss": 4.0215, "step": 2402 }, { "epoch": 0.28, "grad_norm": 0.8256752545788854, "learning_rate": 0.0008508844628082741, "loss": 3.7148, "step": 2403 }, { "epoch": 0.28, "grad_norm": 0.8345271120027508, "learning_rate": 0.0008507521484367265, "loss": 3.9442, "step": 2404 }, { "epoch": 0.28, "grad_norm": 1.419790157505972, "learning_rate": 0.0008506197856856059, "loss": 4.3902, "step": 2405 }, { "epoch": 0.28, "grad_norm": 0.9466958102690043, "learning_rate": 0.0008504873745731694, "loss": 3.9798, "step": 2406 }, { "epoch": 0.28, "grad_norm": 0.9189089691588663, "learning_rate": 0.0008503549151176804, 
"loss": 3.9273, "step": 2407 }, { "epoch": 0.28, "grad_norm": 0.93412414724999, "learning_rate": 0.0008502224073374092, "loss": 4.1825, "step": 2408 }, { "epoch": 0.28, "grad_norm": 0.7446475738591471, "learning_rate": 0.0008500898512506328, "loss": 3.9514, "step": 2409 }, { "epoch": 0.28, "grad_norm": 1.0224528305426162, "learning_rate": 0.0008499572468756347, "loss": 3.7627, "step": 2410 }, { "epoch": 0.28, "grad_norm": 1.4730710649464074, "learning_rate": 0.0008498245942307052, "loss": 3.8372, "step": 2411 }, { "epoch": 0.28, "grad_norm": 1.6612446111676398, "learning_rate": 0.0008496918933341413, "loss": 3.8287, "step": 2412 }, { "epoch": 0.28, "grad_norm": 1.030975336175758, "learning_rate": 0.0008495591442042463, "loss": 3.9179, "step": 2413 }, { "epoch": 0.28, "grad_norm": 1.660388431170337, "learning_rate": 0.0008494263468593307, "loss": 3.8037, "step": 2414 }, { "epoch": 0.28, "grad_norm": 1.012559669865874, "learning_rate": 0.000849293501317711, "loss": 3.9439, "step": 2415 }, { "epoch": 0.28, "grad_norm": 1.235489332297091, "learning_rate": 0.000849160607597711, "loss": 4.0267, "step": 2416 }, { "epoch": 0.28, "grad_norm": 1.0266628820077952, "learning_rate": 0.000849027665717661, "loss": 4.1277, "step": 2417 }, { "epoch": 0.28, "grad_norm": 0.9630062302560293, "learning_rate": 0.0008488946756958973, "loss": 3.8781, "step": 2418 }, { "epoch": 0.28, "grad_norm": 0.9252125402558702, "learning_rate": 0.0008487616375507639, "loss": 4.0576, "step": 2419 }, { "epoch": 0.28, "grad_norm": 1.0835749763330824, "learning_rate": 0.0008486285513006104, "loss": 3.7668, "step": 2420 }, { "epoch": 0.28, "grad_norm": 1.1692608770105173, "learning_rate": 0.0008484954169637937, "loss": 4.1083, "step": 2421 }, { "epoch": 0.28, "grad_norm": 0.789581339529072, "learning_rate": 0.0008483622345586774, "loss": 3.8249, "step": 2422 }, { "epoch": 0.28, "grad_norm": 0.9042337106347419, "learning_rate": 0.0008482290041036309, "loss": 3.894, "step": 2423 }, { "epoch": 0.28, "grad_norm": 0.8834036697770259, "learning_rate": 0.0008480957256170314, "loss": 3.9011, "step": 2424 }, { "epoch": 0.28, "grad_norm": 0.8653942474839383, "learning_rate": 0.0008479623991172618, "loss": 3.8687, "step": 2425 }, { "epoch": 0.28, "grad_norm": 1.0607202822225854, "learning_rate": 0.000847829024622712, "loss": 3.7165, "step": 2426 }, { "epoch": 0.28, "grad_norm": 0.7386733596590822, "learning_rate": 0.0008476956021517783, "loss": 4.0152, "step": 2427 }, { "epoch": 0.28, "grad_norm": 1.20233981165674, "learning_rate": 0.0008475621317228641, "loss": 4.0393, "step": 2428 }, { "epoch": 0.28, "grad_norm": 1.0158778176971124, "learning_rate": 0.000847428613354379, "loss": 3.9656, "step": 2429 }, { "epoch": 0.28, "grad_norm": 1.0127069781541227, "learning_rate": 0.0008472950470647393, "loss": 3.898, "step": 2430 }, { "epoch": 0.28, "grad_norm": 3.9752877038773407, "learning_rate": 0.0008471614328723678, "loss": 3.9128, "step": 2431 }, { "epoch": 0.28, "grad_norm": 1.006977573179346, "learning_rate": 0.0008470277707956943, "loss": 4.136, "step": 2432 }, { "epoch": 0.28, "grad_norm": 1.0144407738135002, "learning_rate": 0.0008468940608531546, "loss": 3.9087, "step": 2433 }, { "epoch": 0.28, "grad_norm": 0.9163168449654888, "learning_rate": 0.0008467603030631916, "loss": 3.988, "step": 2434 }, { "epoch": 0.28, "grad_norm": 1.0306504446365015, "learning_rate": 0.0008466264974442548, "loss": 4.1417, "step": 2435 }, { "epoch": 0.28, "grad_norm": 0.933930062984153, "learning_rate": 0.0008464926440147998, "loss": 3.9053, "step": 2436 }, { 
"epoch": 0.28, "grad_norm": 0.9517894603695751, "learning_rate": 0.0008463587427932895, "loss": 3.8005, "step": 2437 }, { "epoch": 0.28, "grad_norm": 0.8231089862218388, "learning_rate": 0.0008462247937981928, "loss": 3.9135, "step": 2438 }, { "epoch": 0.28, "grad_norm": 1.07667337221273, "learning_rate": 0.0008460907970479853, "loss": 4.1334, "step": 2439 }, { "epoch": 0.28, "grad_norm": 2.951405992301413, "learning_rate": 0.0008459567525611496, "loss": 4.0809, "step": 2440 }, { "epoch": 0.28, "grad_norm": 0.7724578706029513, "learning_rate": 0.0008458226603561742, "loss": 3.7629, "step": 2441 }, { "epoch": 0.28, "grad_norm": 0.9245468861807892, "learning_rate": 0.0008456885204515549, "loss": 4.1396, "step": 2442 }, { "epoch": 0.28, "grad_norm": 0.9218353428253573, "learning_rate": 0.0008455543328657937, "loss": 3.7689, "step": 2443 }, { "epoch": 0.28, "grad_norm": 0.8423110745665592, "learning_rate": 0.0008454200976173991, "loss": 4.1144, "step": 2444 }, { "epoch": 0.28, "grad_norm": 0.974168870082668, "learning_rate": 0.0008452858147248863, "loss": 4.067, "step": 2445 }, { "epoch": 0.28, "grad_norm": 0.8429801049366967, "learning_rate": 0.0008451514842067771, "loss": 3.8523, "step": 2446 }, { "epoch": 0.28, "grad_norm": 0.9356896827949951, "learning_rate": 0.0008450171060815999, "loss": 4.0638, "step": 2447 }, { "epoch": 0.28, "grad_norm": 0.6838082983382259, "learning_rate": 0.0008448826803678896, "loss": 3.919, "step": 2448 }, { "epoch": 0.28, "grad_norm": 1.697409565700111, "learning_rate": 0.0008447482070841875, "loss": 4.2329, "step": 2449 }, { "epoch": 0.28, "grad_norm": 0.93922646914711, "learning_rate": 0.0008446136862490417, "loss": 3.7565, "step": 2450 }, { "epoch": 0.28, "grad_norm": 0.9375997224220436, "learning_rate": 0.0008444791178810068, "loss": 3.9333, "step": 2451 }, { "epoch": 0.28, "grad_norm": 0.9588063159556814, "learning_rate": 0.0008443445019986441, "loss": 3.922, "step": 2452 }, { "epoch": 0.28, "grad_norm": 0.7803687571057546, "learning_rate": 0.0008442098386205211, "loss": 3.8363, "step": 2453 }, { "epoch": 0.28, "grad_norm": 1.535617549129699, "learning_rate": 0.0008440751277652122, "loss": 4.0624, "step": 2454 }, { "epoch": 0.28, "grad_norm": 1.5985207572215492, "learning_rate": 0.0008439403694512978, "loss": 3.8079, "step": 2455 }, { "epoch": 0.28, "grad_norm": 1.0711397460382241, "learning_rate": 0.0008438055636973657, "loss": 3.843, "step": 2456 }, { "epoch": 0.28, "grad_norm": 1.4915103979114503, "learning_rate": 0.0008436707105220096, "loss": 3.985, "step": 2457 }, { "epoch": 0.28, "grad_norm": 1.645506389993902, "learning_rate": 0.00084353580994383, "loss": 3.9793, "step": 2458 }, { "epoch": 0.28, "grad_norm": 1.2806541059718788, "learning_rate": 0.0008434008619814337, "loss": 3.7434, "step": 2459 }, { "epoch": 0.28, "grad_norm": 0.8298393289794685, "learning_rate": 0.0008432658666534345, "loss": 3.9055, "step": 2460 }, { "epoch": 0.28, "grad_norm": 0.778174582231486, "learning_rate": 0.0008431308239784521, "loss": 4.0774, "step": 2461 }, { "epoch": 0.28, "grad_norm": 1.0408576146964232, "learning_rate": 0.0008429957339751132, "loss": 3.6861, "step": 2462 }, { "epoch": 0.28, "grad_norm": 0.8779369241649156, "learning_rate": 0.0008428605966620508, "loss": 3.8307, "step": 2463 }, { "epoch": 0.28, "grad_norm": 2.3329880524677193, "learning_rate": 0.0008427254120579047, "loss": 4.0197, "step": 2464 }, { "epoch": 0.28, "grad_norm": 0.848737294535907, "learning_rate": 0.0008425901801813212, "loss": 4.272, "step": 2465 }, { "epoch": 0.28, "grad_norm": 
1.422271299513801, "learning_rate": 0.0008424549010509524, "loss": 3.9314, "step": 2466 }, { "epoch": 0.28, "grad_norm": 0.7358760459343417, "learning_rate": 0.0008423195746854578, "loss": 3.8376, "step": 2467 }, { "epoch": 0.28, "grad_norm": 4.059424966577902, "learning_rate": 0.000842184201103503, "loss": 4.0982, "step": 2468 }, { "epoch": 0.28, "grad_norm": 1.3367118749951208, "learning_rate": 0.0008420487803237604, "loss": 3.999, "step": 2469 }, { "epoch": 0.28, "grad_norm": 0.9337304602484511, "learning_rate": 0.0008419133123649088, "loss": 4.0148, "step": 2470 }, { "epoch": 0.28, "grad_norm": 0.9146796382131649, "learning_rate": 0.0008417777972456328, "loss": 3.881, "step": 2471 }, { "epoch": 0.28, "grad_norm": 0.7654431724446443, "learning_rate": 0.0008416422349846249, "loss": 4.0127, "step": 2472 }, { "epoch": 0.28, "grad_norm": 0.8633917664725508, "learning_rate": 0.0008415066256005827, "loss": 4.176, "step": 2473 }, { "epoch": 0.28, "grad_norm": 1.3395980494537783, "learning_rate": 0.0008413709691122115, "loss": 3.6748, "step": 2474 }, { "epoch": 0.28, "grad_norm": 1.1355740086899375, "learning_rate": 0.0008412352655382221, "loss": 4.0731, "step": 2475 }, { "epoch": 0.28, "grad_norm": 3.145278251280664, "learning_rate": 0.0008410995148973323, "loss": 3.9108, "step": 2476 }, { "epoch": 0.28, "grad_norm": 0.8578902954102323, "learning_rate": 0.0008409637172082664, "loss": 3.7965, "step": 2477 }, { "epoch": 0.28, "grad_norm": 1.030008109626898, "learning_rate": 0.0008408278724897551, "loss": 3.9008, "step": 2478 }, { "epoch": 0.28, "grad_norm": 1.42498469750493, "learning_rate": 0.0008406919807605356, "loss": 4.0125, "step": 2479 }, { "epoch": 0.28, "grad_norm": 0.8024605517923418, "learning_rate": 0.0008405560420393515, "loss": 4.0033, "step": 2480 }, { "epoch": 0.28, "grad_norm": 1.2679411537128624, "learning_rate": 0.000840420056344953, "loss": 3.9603, "step": 2481 }, { "epoch": 0.28, "grad_norm": 1.602582744373343, "learning_rate": 0.0008402840236960967, "loss": 3.8511, "step": 2482 }, { "epoch": 0.28, "grad_norm": 1.1765441185448013, "learning_rate": 0.0008401479441115456, "loss": 3.9036, "step": 2483 }, { "epoch": 0.28, "grad_norm": 0.8451650716554956, "learning_rate": 0.0008400118176100697, "loss": 3.8307, "step": 2484 }, { "epoch": 0.28, "grad_norm": 0.9180695776763852, "learning_rate": 0.0008398756442104446, "loss": 3.9457, "step": 2485 }, { "epoch": 0.29, "grad_norm": 1.0183185970491904, "learning_rate": 0.0008397394239314529, "loss": 4.0379, "step": 2486 }, { "epoch": 0.29, "grad_norm": 0.8550618648533299, "learning_rate": 0.0008396031567918839, "loss": 3.9988, "step": 2487 }, { "epoch": 0.29, "grad_norm": 0.8566472396899563, "learning_rate": 0.0008394668428105327, "loss": 4.0206, "step": 2488 }, { "epoch": 0.29, "grad_norm": 1.1481786029396042, "learning_rate": 0.0008393304820062016, "loss": 3.9911, "step": 2489 }, { "epoch": 0.29, "grad_norm": 2.454735916892198, "learning_rate": 0.0008391940743976984, "loss": 3.9965, "step": 2490 }, { "epoch": 0.29, "grad_norm": 3.9900803630929533, "learning_rate": 0.0008390576200038385, "loss": 3.9428, "step": 2491 }, { "epoch": 0.29, "grad_norm": 0.9122618980703318, "learning_rate": 0.0008389211188434429, "loss": 4.351, "step": 2492 }, { "epoch": 0.29, "grad_norm": 2.2354421888069638, "learning_rate": 0.0008387845709353392, "loss": 3.9438, "step": 2493 }, { "epoch": 0.29, "grad_norm": 0.8016331331828656, "learning_rate": 0.000838647976298362, "loss": 3.8543, "step": 2494 }, { "epoch": 0.29, "grad_norm": 1.1056415270477746, 
"learning_rate": 0.0008385113349513516, "loss": 3.9781, "step": 2495 }, { "epoch": 0.29, "grad_norm": 0.8409758368379602, "learning_rate": 0.0008383746469131551, "loss": 3.6737, "step": 2496 }, { "epoch": 0.29, "grad_norm": 1.3132248810431797, "learning_rate": 0.0008382379122026263, "loss": 3.7955, "step": 2497 }, { "epoch": 0.29, "grad_norm": 1.2956422441056366, "learning_rate": 0.0008381011308386246, "loss": 3.9036, "step": 2498 }, { "epoch": 0.29, "grad_norm": 1.491365463034812, "learning_rate": 0.0008379643028400168, "loss": 4.2005, "step": 2499 }, { "epoch": 0.29, "grad_norm": 1.4541524540428192, "learning_rate": 0.0008378274282256757, "loss": 4.0739, "step": 2500 }, { "epoch": 0.29, "grad_norm": 1.0130331462790134, "learning_rate": 0.0008376905070144804, "loss": 3.8546, "step": 2501 }, { "epoch": 0.29, "grad_norm": 1.7066568677417842, "learning_rate": 0.0008375535392253166, "loss": 4.1178, "step": 2502 }, { "epoch": 0.29, "grad_norm": 1.2630416012052208, "learning_rate": 0.0008374165248770764, "loss": 3.9189, "step": 2503 }, { "epoch": 0.29, "grad_norm": 0.9370637504559649, "learning_rate": 0.0008372794639886583, "loss": 4.3103, "step": 2504 }, { "epoch": 0.29, "grad_norm": 0.968137378327697, "learning_rate": 0.0008371423565789674, "loss": 4.2028, "step": 2505 }, { "epoch": 0.29, "grad_norm": 0.8657694484158494, "learning_rate": 0.0008370052026669149, "loss": 3.9221, "step": 2506 }, { "epoch": 0.29, "grad_norm": 1.089490063647127, "learning_rate": 0.0008368680022714186, "loss": 3.9837, "step": 2507 }, { "epoch": 0.29, "grad_norm": 1.029690999553673, "learning_rate": 0.0008367307554114025, "loss": 3.8629, "step": 2508 }, { "epoch": 0.29, "grad_norm": 0.8607440233813193, "learning_rate": 0.0008365934621057976, "loss": 3.9438, "step": 2509 }, { "epoch": 0.29, "grad_norm": 1.3948281195944314, "learning_rate": 0.0008364561223735405, "loss": 3.9469, "step": 2510 }, { "epoch": 0.29, "grad_norm": 0.9604351471753871, "learning_rate": 0.0008363187362335749, "loss": 3.9984, "step": 2511 }, { "epoch": 0.29, "grad_norm": 0.9324156232907009, "learning_rate": 0.0008361813037048503, "loss": 3.9987, "step": 2512 }, { "epoch": 0.29, "grad_norm": 1.4167123708130525, "learning_rate": 0.0008360438248063231, "loss": 4.0149, "step": 2513 }, { "epoch": 0.29, "grad_norm": 1.5604939028661955, "learning_rate": 0.0008359062995569559, "loss": 3.9779, "step": 2514 }, { "epoch": 0.29, "grad_norm": 0.8643734045296375, "learning_rate": 0.0008357687279757177, "loss": 3.9261, "step": 2515 }, { "epoch": 0.29, "grad_norm": 0.884787736920026, "learning_rate": 0.0008356311100815837, "loss": 4.0442, "step": 2516 }, { "epoch": 0.29, "grad_norm": 0.8169386742198496, "learning_rate": 0.0008354934458935357, "loss": 4.2266, "step": 2517 }, { "epoch": 0.29, "grad_norm": 1.3137352530351625, "learning_rate": 0.0008353557354305621, "loss": 3.9403, "step": 2518 }, { "epoch": 0.29, "grad_norm": 0.9103199389180696, "learning_rate": 0.0008352179787116572, "loss": 3.8276, "step": 2519 }, { "epoch": 0.29, "grad_norm": 1.043572435271193, "learning_rate": 0.000835080175755822, "loss": 4.0615, "step": 2520 }, { "epoch": 0.29, "grad_norm": 1.705412667539926, "learning_rate": 0.0008349423265820636, "loss": 3.9935, "step": 2521 }, { "epoch": 0.29, "grad_norm": 0.8334230952862067, "learning_rate": 0.0008348044312093959, "loss": 3.871, "step": 2522 }, { "epoch": 0.29, "grad_norm": 1.5148542385631716, "learning_rate": 0.0008346664896568389, "loss": 3.8258, "step": 2523 }, { "epoch": 0.29, "grad_norm": 48.12196217632118, "learning_rate": 
0.000834528501943419, "loss": 3.9217, "step": 2524 }, { "epoch": 0.29, "grad_norm": 1.0616361039856375, "learning_rate": 0.000834390468088169, "loss": 3.7271, "step": 2525 }, { "epoch": 0.29, "grad_norm": 0.811313959941039, "learning_rate": 0.0008342523881101279, "loss": 3.7826, "step": 2526 }, { "epoch": 0.29, "grad_norm": 1.3948039154426823, "learning_rate": 0.0008341142620283412, "loss": 4.0107, "step": 2527 }, { "epoch": 0.29, "grad_norm": 1.9359860257054848, "learning_rate": 0.0008339760898618611, "loss": 3.8204, "step": 2528 }, { "epoch": 0.29, "grad_norm": 2.857753980181266, "learning_rate": 0.0008338378716297454, "loss": 3.9814, "step": 2529 }, { "epoch": 0.29, "grad_norm": 1.0743169042087388, "learning_rate": 0.0008336996073510589, "loss": 3.9242, "step": 2530 }, { "epoch": 0.29, "grad_norm": 1.2431141292163221, "learning_rate": 0.0008335612970448723, "loss": 4.119, "step": 2531 }, { "epoch": 0.29, "grad_norm": 1.4514040767761345, "learning_rate": 0.0008334229407302632, "loss": 4.0033, "step": 2532 }, { "epoch": 0.29, "grad_norm": 1.2203664763379505, "learning_rate": 0.000833284538426315, "loss": 4.0969, "step": 2533 }, { "epoch": 0.29, "grad_norm": 1.6929942275366165, "learning_rate": 0.0008331460901521178, "loss": 4.2063, "step": 2534 }, { "epoch": 0.29, "grad_norm": 1.099191042305377, "learning_rate": 0.0008330075959267677, "loss": 3.9605, "step": 2535 }, { "epoch": 0.29, "grad_norm": 2.947805418743882, "learning_rate": 0.0008328690557693674, "loss": 4.1005, "step": 2536 }, { "epoch": 0.29, "grad_norm": 1.3762931335840163, "learning_rate": 0.000832730469699026, "loss": 4.1235, "step": 2537 }, { "epoch": 0.29, "grad_norm": 1.0567889180196408, "learning_rate": 0.0008325918377348587, "loss": 3.986, "step": 2538 }, { "epoch": 0.29, "grad_norm": 1.1316496284251936, "learning_rate": 0.0008324531598959871, "loss": 4.1584, "step": 2539 }, { "epoch": 0.29, "grad_norm": 0.9971163380251649, "learning_rate": 0.0008323144362015393, "loss": 3.9662, "step": 2540 }, { "epoch": 0.29, "grad_norm": 1.0907122371451121, "learning_rate": 0.0008321756666706495, "loss": 3.9884, "step": 2541 }, { "epoch": 0.29, "grad_norm": 2.674521419113746, "learning_rate": 0.0008320368513224584, "loss": 3.9275, "step": 2542 }, { "epoch": 0.29, "grad_norm": 0.9490718509625404, "learning_rate": 0.0008318979901761128, "loss": 4.2282, "step": 2543 }, { "epoch": 0.29, "grad_norm": 0.9860604246275796, "learning_rate": 0.000831759083250766, "loss": 4.0928, "step": 2544 }, { "epoch": 0.29, "grad_norm": 0.7699856134019598, "learning_rate": 0.0008316201305655775, "loss": 3.9339, "step": 2545 }, { "epoch": 0.29, "grad_norm": 1.7062394043565066, "learning_rate": 0.0008314811321397134, "loss": 4.1356, "step": 2546 }, { "epoch": 0.29, "grad_norm": 1.286823314554175, "learning_rate": 0.0008313420879923456, "loss": 4.0607, "step": 2547 }, { "epoch": 0.29, "grad_norm": 0.9395952111822019, "learning_rate": 0.0008312029981426528, "loss": 4.0576, "step": 2548 }, { "epoch": 0.29, "grad_norm": 1.2849928535868436, "learning_rate": 0.0008310638626098196, "loss": 4.1644, "step": 2549 }, { "epoch": 0.29, "grad_norm": 0.8273935190139345, "learning_rate": 0.0008309246814130372, "loss": 3.9023, "step": 2550 }, { "epoch": 0.29, "grad_norm": 1.1296806728586326, "learning_rate": 0.0008307854545715032, "loss": 4.0118, "step": 2551 }, { "epoch": 0.29, "grad_norm": 3.116146390046185, "learning_rate": 0.0008306461821044209, "loss": 4.057, "step": 2552 }, { "epoch": 0.29, "grad_norm": 0.8884048048254956, "learning_rate": 0.0008305068640310006, "loss": 
4.1816, "step": 2553 }, { "epoch": 0.29, "grad_norm": 1.1722599900299973, "learning_rate": 0.0008303675003704583, "loss": 3.9607, "step": 2554 }, { "epoch": 0.29, "grad_norm": 1.5343900983404506, "learning_rate": 0.0008302280911420167, "loss": 3.931, "step": 2555 }, { "epoch": 0.29, "grad_norm": 3.449072479268191, "learning_rate": 0.0008300886363649048, "loss": 4.0846, "step": 2556 }, { "epoch": 0.29, "grad_norm": 1.3528564670293748, "learning_rate": 0.0008299491360583574, "loss": 4.1201, "step": 2557 }, { "epoch": 0.29, "grad_norm": 0.9876025561678319, "learning_rate": 0.000829809590241616, "loss": 4.0753, "step": 2558 }, { "epoch": 0.29, "grad_norm": 1.1713012517351493, "learning_rate": 0.0008296699989339287, "loss": 3.9909, "step": 2559 }, { "epoch": 0.29, "grad_norm": 0.8678738828826799, "learning_rate": 0.000829530362154549, "loss": 3.9026, "step": 2560 }, { "epoch": 0.29, "grad_norm": 1.146674005814055, "learning_rate": 0.0008293906799227371, "loss": 4.154, "step": 2561 }, { "epoch": 0.29, "grad_norm": 1.0867525610056523, "learning_rate": 0.0008292509522577599, "loss": 4.047, "step": 2562 }, { "epoch": 0.29, "grad_norm": 0.8984386289933354, "learning_rate": 0.0008291111791788897, "loss": 3.759, "step": 2563 }, { "epoch": 0.29, "grad_norm": 0.7752699834416784, "learning_rate": 0.0008289713607054059, "loss": 3.9296, "step": 2564 }, { "epoch": 0.29, "grad_norm": 1.0835593043240626, "learning_rate": 0.0008288314968565938, "loss": 3.9165, "step": 2565 }, { "epoch": 0.29, "grad_norm": 0.8460982724193011, "learning_rate": 0.0008286915876517444, "loss": 3.9664, "step": 2566 }, { "epoch": 0.29, "grad_norm": 0.9202712326134165, "learning_rate": 0.0008285516331101563, "loss": 4.1798, "step": 2567 }, { "epoch": 0.29, "grad_norm": 1.7367923425268394, "learning_rate": 0.0008284116332511329, "loss": 3.928, "step": 2568 }, { "epoch": 0.29, "grad_norm": 0.9139348712906511, "learning_rate": 0.0008282715880939851, "loss": 3.9642, "step": 2569 }, { "epoch": 0.29, "grad_norm": 1.0189794504790166, "learning_rate": 0.0008281314976580289, "loss": 3.9946, "step": 2570 }, { "epoch": 0.29, "grad_norm": 1.7380862460558033, "learning_rate": 0.0008279913619625874, "loss": 3.9197, "step": 2571 }, { "epoch": 0.29, "grad_norm": 0.908838601125907, "learning_rate": 0.0008278511810269896, "loss": 3.7409, "step": 2572 }, { "epoch": 0.3, "grad_norm": 1.39895281692051, "learning_rate": 0.0008277109548705708, "loss": 3.9685, "step": 2573 }, { "epoch": 0.3, "grad_norm": 1.0338103084125427, "learning_rate": 0.0008275706835126726, "loss": 4.1318, "step": 2574 }, { "epoch": 0.3, "grad_norm": 1.2010232738799789, "learning_rate": 0.0008274303669726426, "loss": 3.9996, "step": 2575 }, { "epoch": 0.3, "grad_norm": 1.0061090021987664, "learning_rate": 0.0008272900052698349, "loss": 4.0581, "step": 2576 }, { "epoch": 0.3, "grad_norm": 6.81969984977465, "learning_rate": 0.0008271495984236096, "loss": 3.9465, "step": 2577 }, { "epoch": 0.3, "grad_norm": 2.233950530582823, "learning_rate": 0.0008270091464533333, "loss": 3.9404, "step": 2578 }, { "epoch": 0.3, "grad_norm": 0.8242090681200007, "learning_rate": 0.0008268686493783786, "loss": 4.024, "step": 2579 }, { "epoch": 0.3, "grad_norm": 1.2927024121145894, "learning_rate": 0.0008267281072181245, "loss": 3.709, "step": 2580 }, { "epoch": 0.3, "grad_norm": 0.8384648735333072, "learning_rate": 0.0008265875199919558, "loss": 3.923, "step": 2581 }, { "epoch": 0.3, "grad_norm": 0.8187021447095653, "learning_rate": 0.0008264468877192641, "loss": 3.9462, "step": 2582 }, { "epoch": 0.3, 
"grad_norm": 2.0560729040420753, "learning_rate": 0.000826306210419447, "loss": 4.0261, "step": 2583 }, { "epoch": 0.3, "grad_norm": 2.9200416712061856, "learning_rate": 0.0008261654881119081, "loss": 4.1381, "step": 2584 }, { "epoch": 0.3, "grad_norm": 1.0027495810033435, "learning_rate": 0.0008260247208160574, "loss": 4.0858, "step": 2585 }, { "epoch": 0.3, "grad_norm": 1.2109065343055452, "learning_rate": 0.000825883908551311, "loss": 3.9047, "step": 2586 }, { "epoch": 0.3, "grad_norm": 1.0623503858438932, "learning_rate": 0.0008257430513370914, "loss": 3.9548, "step": 2587 }, { "epoch": 0.3, "grad_norm": 0.8513165471115364, "learning_rate": 0.000825602149192827, "loss": 3.9472, "step": 2588 }, { "epoch": 0.3, "grad_norm": 1.124570625746105, "learning_rate": 0.0008254612021379526, "loss": 3.7108, "step": 2589 }, { "epoch": 0.3, "grad_norm": 2.718501884444714, "learning_rate": 0.0008253202101919095, "loss": 3.9822, "step": 2590 }, { "epoch": 0.3, "grad_norm": 1.1797658906139883, "learning_rate": 0.0008251791733741442, "loss": 4.1089, "step": 2591 }, { "epoch": 0.3, "grad_norm": 0.9757476272258978, "learning_rate": 0.0008250380917041107, "loss": 3.8308, "step": 2592 }, { "epoch": 0.3, "grad_norm": 0.9731654048656984, "learning_rate": 0.0008248969652012681, "loss": 3.9848, "step": 2593 }, { "epoch": 0.3, "grad_norm": 1.212247727456887, "learning_rate": 0.0008247557938850824, "loss": 4.0108, "step": 2594 }, { "epoch": 0.3, "grad_norm": 1.809357334835167, "learning_rate": 0.0008246145777750253, "loss": 4.0185, "step": 2595 }, { "epoch": 0.3, "grad_norm": 1.1789261097315042, "learning_rate": 0.0008244733168905748, "loss": 3.9507, "step": 2596 }, { "epoch": 0.3, "grad_norm": 1.2838483741650433, "learning_rate": 0.0008243320112512153, "loss": 3.8855, "step": 2597 }, { "epoch": 0.3, "grad_norm": 0.831548004670212, "learning_rate": 0.0008241906608764373, "loss": 3.9904, "step": 2598 }, { "epoch": 0.3, "grad_norm": 0.9380287575030463, "learning_rate": 0.000824049265785737, "loss": 4.2456, "step": 2599 }, { "epoch": 0.3, "grad_norm": 0.942563872203756, "learning_rate": 0.0008239078259986177, "loss": 4.0264, "step": 2600 }, { "epoch": 0.3, "grad_norm": 0.9314283965469843, "learning_rate": 0.0008237663415345879, "loss": 3.9434, "step": 2601 }, { "epoch": 0.3, "grad_norm": 0.9804728355260155, "learning_rate": 0.0008236248124131629, "loss": 4.0881, "step": 2602 }, { "epoch": 0.3, "grad_norm": 1.1561713732092702, "learning_rate": 0.0008234832386538639, "loss": 4.0186, "step": 2603 }, { "epoch": 0.3, "grad_norm": 0.905473026737114, "learning_rate": 0.0008233416202762182, "loss": 3.8069, "step": 2604 }, { "epoch": 0.3, "grad_norm": 0.9710196096645655, "learning_rate": 0.0008231999572997595, "loss": 4.0598, "step": 2605 }, { "epoch": 0.3, "grad_norm": 0.8737178969696513, "learning_rate": 0.0008230582497440273, "loss": 4.0411, "step": 2606 }, { "epoch": 0.3, "grad_norm": 0.9975384041982325, "learning_rate": 0.0008229164976285678, "loss": 3.7671, "step": 2607 }, { "epoch": 0.3, "grad_norm": 0.9512379649303974, "learning_rate": 0.0008227747009729327, "loss": 4.1249, "step": 2608 }, { "epoch": 0.3, "grad_norm": 1.1278532138032489, "learning_rate": 0.0008226328597966803, "loss": 3.909, "step": 2609 }, { "epoch": 0.3, "grad_norm": 0.9707989529667285, "learning_rate": 0.0008224909741193747, "loss": 4.1197, "step": 2610 }, { "epoch": 0.3, "grad_norm": 0.8868665519306765, "learning_rate": 0.0008223490439605865, "loss": 3.8661, "step": 2611 }, { "epoch": 0.3, "grad_norm": 1.0871150144750616, "learning_rate": 
0.0008222070693398924, "loss": 4.2094, "step": 2612 }, { "epoch": 0.3, "grad_norm": 1.0672470254965396, "learning_rate": 0.0008220650502768748, "loss": 4.3, "step": 2613 }, { "epoch": 0.3, "grad_norm": 0.8688864408260192, "learning_rate": 0.0008219229867911224, "loss": 3.9348, "step": 2614 }, { "epoch": 0.3, "grad_norm": 0.946347891477119, "learning_rate": 0.0008217808789022308, "loss": 3.8858, "step": 2615 }, { "epoch": 0.3, "grad_norm": 0.9882213495056921, "learning_rate": 0.0008216387266298004, "loss": 3.7753, "step": 2616 }, { "epoch": 0.3, "grad_norm": 2.5586875775522584, "learning_rate": 0.0008214965299934386, "loss": 3.9684, "step": 2617 }, { "epoch": 0.3, "grad_norm": 1.1760630710787108, "learning_rate": 0.0008213542890127589, "loss": 4.0723, "step": 2618 }, { "epoch": 0.3, "grad_norm": 1.527485728527442, "learning_rate": 0.0008212120037073805, "loss": 3.9514, "step": 2619 }, { "epoch": 0.3, "grad_norm": 1.000455822045798, "learning_rate": 0.0008210696740969292, "loss": 3.9155, "step": 2620 }, { "epoch": 0.3, "grad_norm": 0.9398738547975005, "learning_rate": 0.0008209273002010364, "loss": 4.1384, "step": 2621 }, { "epoch": 0.3, "grad_norm": 1.1799689579829928, "learning_rate": 0.00082078488203934, "loss": 4.0782, "step": 2622 }, { "epoch": 0.3, "grad_norm": 0.7831526519403089, "learning_rate": 0.0008206424196314838, "loss": 3.9218, "step": 2623 }, { "epoch": 0.3, "grad_norm": 1.1081447382112273, "learning_rate": 0.0008204999129971178, "loss": 4.0611, "step": 2624 }, { "epoch": 0.3, "grad_norm": 0.9156541916471225, "learning_rate": 0.0008203573621558982, "loss": 3.8323, "step": 2625 }, { "epoch": 0.3, "grad_norm": 1.1775130121741613, "learning_rate": 0.0008202147671274869, "loss": 3.9728, "step": 2626 }, { "epoch": 0.3, "grad_norm": 0.9927684793345399, "learning_rate": 0.0008200721279315524, "loss": 3.9875, "step": 2627 }, { "epoch": 0.3, "grad_norm": 1.4090702178407455, "learning_rate": 0.000819929444587769, "loss": 3.9967, "step": 2628 }, { "epoch": 0.3, "grad_norm": 1.2832027678029638, "learning_rate": 0.0008197867171158171, "loss": 3.9674, "step": 2629 }, { "epoch": 0.3, "grad_norm": 1.266591138902616, "learning_rate": 0.0008196439455353833, "loss": 4.1595, "step": 2630 }, { "epoch": 0.3, "grad_norm": 0.8336229397403966, "learning_rate": 0.0008195011298661601, "loss": 4.0009, "step": 2631 }, { "epoch": 0.3, "grad_norm": 1.6780383601359878, "learning_rate": 0.0008193582701278464, "loss": 3.7572, "step": 2632 }, { "epoch": 0.3, "grad_norm": 3.589954759088504, "learning_rate": 0.0008192153663401467, "loss": 3.8619, "step": 2633 }, { "epoch": 0.3, "grad_norm": 1.092955797022132, "learning_rate": 0.0008190724185227722, "loss": 4.0658, "step": 2634 }, { "epoch": 0.3, "grad_norm": 0.8725548619147876, "learning_rate": 0.0008189294266954395, "loss": 3.8422, "step": 2635 }, { "epoch": 0.3, "grad_norm": 1.139634378036199, "learning_rate": 0.0008187863908778718, "loss": 3.6668, "step": 2636 }, { "epoch": 0.3, "grad_norm": 1.8279935064694217, "learning_rate": 0.0008186433110897982, "loss": 3.9912, "step": 2637 }, { "epoch": 0.3, "grad_norm": 0.9100648230331877, "learning_rate": 0.0008185001873509534, "loss": 4.1805, "step": 2638 }, { "epoch": 0.3, "grad_norm": 1.2178620567206633, "learning_rate": 0.0008183570196810793, "loss": 3.7777, "step": 2639 }, { "epoch": 0.3, "grad_norm": 1.0506663473696358, "learning_rate": 0.0008182138080999226, "loss": 3.8543, "step": 2640 }, { "epoch": 0.3, "grad_norm": 0.8160699816732707, "learning_rate": 0.0008180705526272368, "loss": 3.8549, "step": 2641 }, { 
"epoch": 0.3, "grad_norm": 2.1341299088009293, "learning_rate": 0.0008179272532827811, "loss": 3.9477, "step": 2642 }, { "epoch": 0.3, "grad_norm": 0.9655180923868404, "learning_rate": 0.0008177839100863212, "loss": 3.9003, "step": 2643 }, { "epoch": 0.3, "grad_norm": 1.0605041186086015, "learning_rate": 0.0008176405230576285, "loss": 3.8024, "step": 2644 }, { "epoch": 0.3, "grad_norm": 0.8093211900958527, "learning_rate": 0.0008174970922164803, "loss": 4.0604, "step": 2645 }, { "epoch": 0.3, "grad_norm": 0.9018127399315923, "learning_rate": 0.0008173536175826603, "loss": 4.0206, "step": 2646 }, { "epoch": 0.3, "grad_norm": 1.853591655096383, "learning_rate": 0.000817210099175958, "loss": 4.2233, "step": 2647 }, { "epoch": 0.3, "grad_norm": 1.8466987007955065, "learning_rate": 0.0008170665370161691, "loss": 3.7984, "step": 2648 }, { "epoch": 0.3, "grad_norm": 5.020744057069236, "learning_rate": 0.0008169229311230954, "loss": 4.1698, "step": 2649 }, { "epoch": 0.3, "grad_norm": 1.1400620280803884, "learning_rate": 0.0008167792815165444, "loss": 4.0694, "step": 2650 }, { "epoch": 0.3, "grad_norm": 0.7813691716646572, "learning_rate": 0.0008166355882163296, "loss": 4.0799, "step": 2651 }, { "epoch": 0.3, "grad_norm": 0.9741795860775472, "learning_rate": 0.0008164918512422715, "loss": 3.8073, "step": 2652 }, { "epoch": 0.3, "grad_norm": 0.8042988898446998, "learning_rate": 0.000816348070614195, "loss": 3.989, "step": 2653 }, { "epoch": 0.3, "grad_norm": 0.8120022705165385, "learning_rate": 0.0008162042463519326, "loss": 3.8042, "step": 2654 }, { "epoch": 0.3, "grad_norm": 0.9960644842956661, "learning_rate": 0.0008160603784753217, "loss": 4.0245, "step": 2655 }, { "epoch": 0.3, "grad_norm": 1.04242764389108, "learning_rate": 0.0008159164670042062, "loss": 4.1588, "step": 2656 }, { "epoch": 0.3, "grad_norm": 0.9395553526138622, "learning_rate": 0.000815772511958436, "loss": 3.9286, "step": 2657 }, { "epoch": 0.3, "grad_norm": 0.8794596600328727, "learning_rate": 0.000815628513357867, "loss": 4.1022, "step": 2658 }, { "epoch": 0.3, "grad_norm": 0.8622681181135726, "learning_rate": 0.000815484471222361, "loss": 4.1923, "step": 2659 }, { "epoch": 0.31, "grad_norm": 0.737248606503567, "learning_rate": 0.0008153403855717858, "loss": 3.8922, "step": 2660 }, { "epoch": 0.31, "grad_norm": 0.7620054188506213, "learning_rate": 0.0008151962564260153, "loss": 4.0185, "step": 2661 }, { "epoch": 0.31, "grad_norm": 0.8744543870813347, "learning_rate": 0.0008150520838049297, "loss": 4.1685, "step": 2662 }, { "epoch": 0.31, "grad_norm": 1.798135050662947, "learning_rate": 0.0008149078677284143, "loss": 3.8676, "step": 2663 }, { "epoch": 0.31, "grad_norm": 1.246322571777222, "learning_rate": 0.0008147636082163614, "loss": 4.0242, "step": 2664 }, { "epoch": 0.31, "grad_norm": 0.9807535555015304, "learning_rate": 0.0008146193052886685, "loss": 3.9485, "step": 2665 }, { "epoch": 0.31, "grad_norm": 0.8831071498721812, "learning_rate": 0.0008144749589652398, "loss": 4.0533, "step": 2666 }, { "epoch": 0.31, "grad_norm": 0.8362869473904423, "learning_rate": 0.0008143305692659849, "loss": 4.0876, "step": 2667 }, { "epoch": 0.31, "grad_norm": 0.7637473600696171, "learning_rate": 0.0008141861362108196, "loss": 3.7924, "step": 2668 }, { "epoch": 0.31, "grad_norm": 0.8891687880880162, "learning_rate": 0.0008140416598196659, "loss": 4.114, "step": 2669 }, { "epoch": 0.31, "grad_norm": 0.8175649020507004, "learning_rate": 0.0008138971401124513, "loss": 3.9467, "step": 2670 }, { "epoch": 0.31, "grad_norm": 0.8029612479848057, 
"learning_rate": 0.0008137525771091097, "loss": 4.04, "step": 2671 }, { "epoch": 0.31, "grad_norm": 6.701892247915068, "learning_rate": 0.0008136079708295807, "loss": 3.8627, "step": 2672 }, { "epoch": 0.31, "grad_norm": 0.786725404648604, "learning_rate": 0.00081346332129381, "loss": 3.9668, "step": 2673 }, { "epoch": 0.31, "grad_norm": 0.6795966756272106, "learning_rate": 0.0008133186285217493, "loss": 3.8951, "step": 2674 }, { "epoch": 0.31, "grad_norm": 0.9329360019619175, "learning_rate": 0.0008131738925333563, "loss": 4.2505, "step": 2675 }, { "epoch": 0.31, "grad_norm": 1.2069630875451651, "learning_rate": 0.0008130291133485943, "loss": 3.9715, "step": 2676 }, { "epoch": 0.31, "grad_norm": 1.514317905917099, "learning_rate": 0.000812884290987433, "loss": 3.9921, "step": 2677 }, { "epoch": 0.31, "grad_norm": 1.3355765120113674, "learning_rate": 0.0008127394254698479, "loss": 3.9256, "step": 2678 }, { "epoch": 0.31, "grad_norm": 3.946427368521438, "learning_rate": 0.0008125945168158205, "loss": 3.9987, "step": 2679 }, { "epoch": 0.31, "grad_norm": 0.9247215500681851, "learning_rate": 0.000812449565045338, "loss": 3.7882, "step": 2680 }, { "epoch": 0.31, "grad_norm": 0.7461805537372401, "learning_rate": 0.000812304570178394, "loss": 4.045, "step": 2681 }, { "epoch": 0.31, "grad_norm": 0.8998231282408888, "learning_rate": 0.0008121595322349875, "loss": 3.6986, "step": 2682 }, { "epoch": 0.31, "grad_norm": 0.7411691868067789, "learning_rate": 0.0008120144512351237, "loss": 3.9985, "step": 2683 }, { "epoch": 0.31, "grad_norm": 1.2851309991661914, "learning_rate": 0.0008118693271988142, "loss": 4.1687, "step": 2684 }, { "epoch": 0.31, "grad_norm": 1.9928121254751574, "learning_rate": 0.0008117241601460755, "loss": 4.0274, "step": 2685 }, { "epoch": 0.31, "grad_norm": 0.7973697878166337, "learning_rate": 0.0008115789500969309, "loss": 4.113, "step": 2686 }, { "epoch": 0.31, "grad_norm": 2.181756394412963, "learning_rate": 0.0008114336970714096, "loss": 3.828, "step": 2687 }, { "epoch": 0.31, "grad_norm": 0.8264191683187341, "learning_rate": 0.0008112884010895461, "loss": 4.0092, "step": 2688 }, { "epoch": 0.31, "grad_norm": 5.551551726023364, "learning_rate": 0.0008111430621713814, "loss": 3.9793, "step": 2689 }, { "epoch": 0.31, "grad_norm": 0.9193404356530556, "learning_rate": 0.0008109976803369623, "loss": 3.9543, "step": 2690 }, { "epoch": 0.31, "grad_norm": 0.7817832365908034, "learning_rate": 0.0008108522556063411, "loss": 4.0588, "step": 2691 }, { "epoch": 0.31, "grad_norm": 0.7894552713648364, "learning_rate": 0.0008107067879995768, "loss": 3.9162, "step": 2692 }, { "epoch": 0.31, "grad_norm": 0.6956521510483695, "learning_rate": 0.0008105612775367337, "loss": 4.0317, "step": 2693 }, { "epoch": 0.31, "grad_norm": 2.361514723884876, "learning_rate": 0.0008104157242378821, "loss": 3.9863, "step": 2694 }, { "epoch": 0.31, "grad_norm": 0.8893211503911383, "learning_rate": 0.0008102701281230985, "loss": 3.9058, "step": 2695 }, { "epoch": 0.31, "grad_norm": 0.7730194816694382, "learning_rate": 0.0008101244892124651, "loss": 3.8888, "step": 2696 }, { "epoch": 0.31, "grad_norm": 1.1426273871139703, "learning_rate": 0.0008099788075260698, "loss": 4.0163, "step": 2697 }, { "epoch": 0.31, "grad_norm": 0.9356801484356292, "learning_rate": 0.0008098330830840066, "loss": 3.8302, "step": 2698 }, { "epoch": 0.31, "grad_norm": 0.7327268439247038, "learning_rate": 0.0008096873159063758, "loss": 4.1242, "step": 2699 }, { "epoch": 0.31, "grad_norm": 0.9152789686394206, "learning_rate": 
0.0008095415060132829, "loss": 4.0226, "step": 2700 }, { "epoch": 0.31, "grad_norm": 1.7663824783710986, "learning_rate": 0.0008093956534248395, "loss": 3.908, "step": 2701 }, { "epoch": 0.31, "grad_norm": 0.6919150469394324, "learning_rate": 0.0008092497581611636, "loss": 4.1373, "step": 2702 }, { "epoch": 0.31, "grad_norm": 1.193592339375226, "learning_rate": 0.0008091038202423781, "loss": 3.9508, "step": 2703 }, { "epoch": 0.31, "grad_norm": 0.8266971012702847, "learning_rate": 0.0008089578396886128, "loss": 4.02, "step": 2704 }, { "epoch": 0.31, "grad_norm": 0.855863382263066, "learning_rate": 0.0008088118165200026, "loss": 3.8974, "step": 2705 }, { "epoch": 0.31, "grad_norm": 0.8891708616356692, "learning_rate": 0.000808665750756689, "loss": 3.9948, "step": 2706 }, { "epoch": 0.31, "grad_norm": 0.7858234188028291, "learning_rate": 0.0008085196424188188, "loss": 3.8328, "step": 2707 }, { "epoch": 0.31, "grad_norm": 1.7345366627956063, "learning_rate": 0.0008083734915265448, "loss": 3.8908, "step": 2708 }, { "epoch": 0.31, "grad_norm": 1.0966729050786763, "learning_rate": 0.0008082272981000258, "loss": 3.8981, "step": 2709 }, { "epoch": 0.31, "grad_norm": 1.5966845923277222, "learning_rate": 0.0008080810621594264, "loss": 3.8724, "step": 2710 }, { "epoch": 0.31, "grad_norm": 0.9025818854045417, "learning_rate": 0.0008079347837249168, "loss": 3.7856, "step": 2711 }, { "epoch": 0.31, "grad_norm": 0.8548603635305055, "learning_rate": 0.0008077884628166738, "loss": 3.8799, "step": 2712 }, { "epoch": 0.31, "grad_norm": 1.4535105024443078, "learning_rate": 0.0008076420994548792, "loss": 3.749, "step": 2713 }, { "epoch": 0.31, "grad_norm": 0.843996843832999, "learning_rate": 0.0008074956936597213, "loss": 3.8232, "step": 2714 }, { "epoch": 0.31, "grad_norm": 1.7650181474276436, "learning_rate": 0.0008073492454513938, "loss": 3.7984, "step": 2715 }, { "epoch": 0.31, "grad_norm": 0.770349245764047, "learning_rate": 0.0008072027548500964, "loss": 3.9253, "step": 2716 }, { "epoch": 0.31, "grad_norm": 1.2278525450191942, "learning_rate": 0.0008070562218760349, "loss": 4.2264, "step": 2717 }, { "epoch": 0.31, "grad_norm": 2.1023085351614266, "learning_rate": 0.0008069096465494205, "loss": 3.9964, "step": 2718 }, { "epoch": 0.31, "grad_norm": 1.1885882708686453, "learning_rate": 0.0008067630288904708, "loss": 3.7444, "step": 2719 }, { "epoch": 0.31, "grad_norm": 1.156561158637329, "learning_rate": 0.0008066163689194086, "loss": 3.9443, "step": 2720 }, { "epoch": 0.31, "grad_norm": 1.4274981881145583, "learning_rate": 0.0008064696666564631, "loss": 3.6603, "step": 2721 }, { "epoch": 0.31, "grad_norm": 0.9303193374404456, "learning_rate": 0.0008063229221218686, "loss": 4.0276, "step": 2722 }, { "epoch": 0.31, "grad_norm": 0.984468562184463, "learning_rate": 0.0008061761353358663, "loss": 3.8234, "step": 2723 }, { "epoch": 0.31, "grad_norm": 1.724415564781025, "learning_rate": 0.0008060293063187023, "loss": 4.0003, "step": 2724 }, { "epoch": 0.31, "grad_norm": 1.3183493645017315, "learning_rate": 0.0008058824350906288, "loss": 3.9011, "step": 2725 }, { "epoch": 0.31, "grad_norm": 0.7525736780809323, "learning_rate": 0.0008057355216719043, "loss": 4.073, "step": 2726 }, { "epoch": 0.31, "grad_norm": 0.839024534816327, "learning_rate": 0.0008055885660827922, "loss": 3.8572, "step": 2727 }, { "epoch": 0.31, "grad_norm": 0.8378287700641947, "learning_rate": 0.0008054415683435625, "loss": 4.0, "step": 2728 }, { "epoch": 0.31, "grad_norm": 3.383317394066838, "learning_rate": 0.0008052945284744909, "loss": 
3.9389, "step": 2729 }, { "epoch": 0.31, "grad_norm": 0.8255216538629481, "learning_rate": 0.0008051474464958584, "loss": 3.9694, "step": 2730 }, { "epoch": 0.31, "grad_norm": 3.0495844883494896, "learning_rate": 0.0008050003224279521, "loss": 3.9245, "step": 2731 }, { "epoch": 0.31, "grad_norm": 0.910364997374101, "learning_rate": 0.0008048531562910655, "loss": 3.9877, "step": 2732 }, { "epoch": 0.31, "grad_norm": 0.7055365917159754, "learning_rate": 0.0008047059481054967, "loss": 3.9334, "step": 2733 }, { "epoch": 0.31, "grad_norm": 0.7776124641249378, "learning_rate": 0.0008045586978915508, "loss": 3.7157, "step": 2734 }, { "epoch": 0.31, "grad_norm": 0.9437011624833566, "learning_rate": 0.0008044114056695379, "loss": 3.8958, "step": 2735 }, { "epoch": 0.31, "grad_norm": 0.8115444489343417, "learning_rate": 0.0008042640714597741, "loss": 4.0245, "step": 2736 }, { "epoch": 0.31, "grad_norm": 0.9959770587820759, "learning_rate": 0.0008041166952825816, "loss": 3.9501, "step": 2737 }, { "epoch": 0.31, "grad_norm": 1.3162954427766302, "learning_rate": 0.0008039692771582878, "loss": 4.0346, "step": 2738 }, { "epoch": 0.31, "grad_norm": 1.0687043737599595, "learning_rate": 0.0008038218171072264, "loss": 4.0225, "step": 2739 }, { "epoch": 0.31, "grad_norm": 1.0271528840880164, "learning_rate": 0.000803674315149737, "loss": 4.15, "step": 2740 }, { "epoch": 0.31, "grad_norm": 1.4232307809370077, "learning_rate": 0.0008035267713061641, "loss": 3.828, "step": 2741 }, { "epoch": 0.31, "grad_norm": 1.298153412066498, "learning_rate": 0.0008033791855968589, "loss": 4.035, "step": 2742 }, { "epoch": 0.31, "grad_norm": 0.8153973151222473, "learning_rate": 0.000803231558042178, "loss": 3.8755, "step": 2743 }, { "epoch": 0.31, "grad_norm": 1.0518388601437985, "learning_rate": 0.0008030838886624838, "loss": 3.9853, "step": 2744 }, { "epoch": 0.31, "grad_norm": 0.8753118081494248, "learning_rate": 0.0008029361774781446, "loss": 4.1503, "step": 2745 }, { "epoch": 0.31, "grad_norm": 0.8214645913444754, "learning_rate": 0.0008027884245095338, "loss": 3.8503, "step": 2746 }, { "epoch": 0.31, "grad_norm": 1.6537639225325307, "learning_rate": 0.0008026406297770319, "loss": 3.7893, "step": 2747 }, { "epoch": 0.32, "grad_norm": 0.9764862412683216, "learning_rate": 0.0008024927933010237, "loss": 3.9725, "step": 2748 }, { "epoch": 0.32, "grad_norm": 1.4144769729813351, "learning_rate": 0.0008023449151019009, "loss": 3.708, "step": 2749 }, { "epoch": 0.32, "grad_norm": 0.7801247235992312, "learning_rate": 0.0008021969952000603, "loss": 3.9234, "step": 2750 }, { "epoch": 0.32, "grad_norm": 0.8174139699126767, "learning_rate": 0.0008020490336159045, "loss": 3.8401, "step": 2751 }, { "epoch": 0.32, "grad_norm": 2.0895947086790296, "learning_rate": 0.0008019010303698422, "loss": 3.9122, "step": 2752 }, { "epoch": 0.32, "grad_norm": 1.5098707116208527, "learning_rate": 0.0008017529854822873, "loss": 3.9278, "step": 2753 }, { "epoch": 0.32, "grad_norm": 0.7694856417029841, "learning_rate": 0.0008016048989736602, "loss": 4.0884, "step": 2754 }, { "epoch": 0.32, "grad_norm": 1.015367727779244, "learning_rate": 0.0008014567708643864, "loss": 3.9073, "step": 2755 }, { "epoch": 0.32, "grad_norm": 3.3400865420067793, "learning_rate": 0.0008013086011748972, "loss": 3.874, "step": 2756 }, { "epoch": 0.32, "grad_norm": 1.4854843850094446, "learning_rate": 0.0008011603899256301, "loss": 3.8163, "step": 2757 }, { "epoch": 0.32, "grad_norm": 1.001445337313814, "learning_rate": 0.0008010121371370277, "loss": 3.827, "step": 2758 }, { 
"epoch": 0.32, "grad_norm": 1.1614708736615003, "learning_rate": 0.000800863842829539, "loss": 4.0865, "step": 2759 }, { "epoch": 0.32, "grad_norm": 2.3134492720212636, "learning_rate": 0.000800715507023618, "loss": 3.8828, "step": 2760 }, { "epoch": 0.32, "grad_norm": 1.2415840209231628, "learning_rate": 0.0008005671297397248, "loss": 3.9334, "step": 2761 }, { "epoch": 0.32, "grad_norm": 1.0071668257835977, "learning_rate": 0.0008004187109983257, "loss": 3.9424, "step": 2762 }, { "epoch": 0.32, "grad_norm": 1.0431236814651308, "learning_rate": 0.0008002702508198918, "loss": 3.9317, "step": 2763 }, { "epoch": 0.32, "grad_norm": 1.1548685612456089, "learning_rate": 0.0008001217492249004, "loss": 3.8931, "step": 2764 }, { "epoch": 0.32, "grad_norm": 2.1400881227344546, "learning_rate": 0.0007999732062338347, "loss": 4.1062, "step": 2765 }, { "epoch": 0.32, "grad_norm": 11.337640077853482, "learning_rate": 0.0007998246218671829, "loss": 4.2695, "step": 2766 }, { "epoch": 0.32, "grad_norm": 12.618338066178993, "learning_rate": 0.00079967599614544, "loss": 3.9476, "step": 2767 }, { "epoch": 0.32, "grad_norm": 7.53396852411618, "learning_rate": 0.0007995273290891056, "loss": 4.4291, "step": 2768 }, { "epoch": 0.32, "grad_norm": 6.776767118939428, "learning_rate": 0.0007993786207186859, "loss": 4.1048, "step": 2769 }, { "epoch": 0.32, "grad_norm": 3.736886199818959, "learning_rate": 0.000799229871054692, "loss": 4.5547, "step": 2770 }, { "epoch": 0.32, "grad_norm": 24.829950642012765, "learning_rate": 0.0007990810801176411, "loss": 4.0944, "step": 2771 }, { "epoch": 0.32, "grad_norm": 1.3872045246343345, "learning_rate": 0.0007989322479280564, "loss": 4.19, "step": 2772 }, { "epoch": 0.32, "grad_norm": 6.38622344994003, "learning_rate": 0.000798783374506466, "loss": 4.008, "step": 2773 }, { "epoch": 0.32, "grad_norm": 1.9717269342907684, "learning_rate": 0.0007986344598734048, "loss": 3.6167, "step": 2774 }, { "epoch": 0.32, "grad_norm": 1.740756974131187, "learning_rate": 0.0007984855040494122, "loss": 4.1897, "step": 2775 }, { "epoch": 0.32, "grad_norm": 2.3730169055599, "learning_rate": 0.0007983365070550339, "loss": 4.0664, "step": 2776 }, { "epoch": 0.32, "grad_norm": 1.0894725095659732, "learning_rate": 0.0007981874689108213, "loss": 4.1646, "step": 2777 }, { "epoch": 0.32, "grad_norm": 0.9693855657529189, "learning_rate": 0.0007980383896373312, "loss": 4.0332, "step": 2778 }, { "epoch": 0.32, "grad_norm": 0.9972728919008723, "learning_rate": 0.0007978892692551265, "loss": 4.1063, "step": 2779 }, { "epoch": 0.32, "grad_norm": 1.6985739879146275, "learning_rate": 0.0007977401077847755, "loss": 4.3329, "step": 2780 }, { "epoch": 0.32, "grad_norm": 1.196353881043433, "learning_rate": 0.0007975909052468518, "loss": 3.8156, "step": 2781 }, { "epoch": 0.32, "grad_norm": 1.3176068178173757, "learning_rate": 0.0007974416616619355, "loss": 4.2657, "step": 2782 }, { "epoch": 0.32, "grad_norm": 1.1162736221386216, "learning_rate": 0.0007972923770506118, "loss": 4.1117, "step": 2783 }, { "epoch": 0.32, "grad_norm": 1.0766690223076696, "learning_rate": 0.0007971430514334715, "loss": 4.1471, "step": 2784 }, { "epoch": 0.32, "grad_norm": 1.0272308071446519, "learning_rate": 0.0007969936848311113, "loss": 4.1441, "step": 2785 }, { "epoch": 0.32, "grad_norm": 1.0611691610932632, "learning_rate": 0.0007968442772641334, "loss": 3.9282, "step": 2786 }, { "epoch": 0.32, "grad_norm": 1.1640315301377098, "learning_rate": 0.000796694828753146, "loss": 4.0793, "step": 2787 }, { "epoch": 0.32, "grad_norm": 
1.0873218494699712, "learning_rate": 0.0007965453393187624, "loss": 4.0915, "step": 2788 }, { "epoch": 0.32, "grad_norm": 1.3229250863916853, "learning_rate": 0.000796395808981602, "loss": 4.1144, "step": 2789 }, { "epoch": 0.32, "grad_norm": 0.9354102898142259, "learning_rate": 0.0007962462377622895, "loss": 3.9244, "step": 2790 }, { "epoch": 0.32, "grad_norm": 1.4761852456927063, "learning_rate": 0.0007960966256814555, "loss": 3.928, "step": 2791 }, { "epoch": 0.32, "grad_norm": 1.2160384451041661, "learning_rate": 0.0007959469727597359, "loss": 3.9146, "step": 2792 }, { "epoch": 0.32, "grad_norm": 1.124950976116157, "learning_rate": 0.0007957972790177729, "loss": 4.3564, "step": 2793 }, { "epoch": 0.32, "grad_norm": 1.1755145952312769, "learning_rate": 0.0007956475444762137, "loss": 4.1498, "step": 2794 }, { "epoch": 0.32, "grad_norm": 1.1457034615388906, "learning_rate": 0.0007954977691557112, "loss": 4.2814, "step": 2795 }, { "epoch": 0.32, "grad_norm": 0.8595320080185206, "learning_rate": 0.0007953479530769241, "loss": 4.0852, "step": 2796 }, { "epoch": 0.32, "grad_norm": 0.9699610888106223, "learning_rate": 0.0007951980962605168, "loss": 4.2344, "step": 2797 }, { "epoch": 0.32, "grad_norm": 1.1069418622905705, "learning_rate": 0.000795048198727159, "loss": 4.2064, "step": 2798 }, { "epoch": 0.32, "grad_norm": 0.9189453939231642, "learning_rate": 0.0007948982604975264, "loss": 4.0935, "step": 2799 }, { "epoch": 0.32, "grad_norm": 0.8255042619363008, "learning_rate": 0.0007947482815923001, "loss": 4.0399, "step": 2800 }, { "epoch": 0.32, "grad_norm": 0.8549423520150055, "learning_rate": 0.0007945982620321666, "loss": 4.1038, "step": 2801 }, { "epoch": 0.32, "grad_norm": 0.9090882450955426, "learning_rate": 0.0007944482018378185, "loss": 4.2242, "step": 2802 }, { "epoch": 0.32, "grad_norm": 0.8307630436650605, "learning_rate": 0.0007942981010299537, "loss": 4.004, "step": 2803 }, { "epoch": 0.32, "grad_norm": 0.8442861955631671, "learning_rate": 0.0007941479596292756, "loss": 4.0083, "step": 2804 }, { "epoch": 0.32, "grad_norm": 1.2905937742439755, "learning_rate": 0.0007939977776564935, "loss": 3.9092, "step": 2805 }, { "epoch": 0.32, "grad_norm": 1.1884108585856277, "learning_rate": 0.0007938475551323221, "loss": 4.1534, "step": 2806 }, { "epoch": 0.32, "grad_norm": 0.8482706300514102, "learning_rate": 0.0007936972920774817, "loss": 3.9782, "step": 2807 }, { "epoch": 0.32, "grad_norm": 0.921025044828864, "learning_rate": 0.000793546988512698, "loss": 3.9158, "step": 2808 }, { "epoch": 0.32, "grad_norm": 0.7309716388079802, "learning_rate": 0.0007933966444587031, "loss": 3.9478, "step": 2809 }, { "epoch": 0.32, "grad_norm": 0.8009003283423011, "learning_rate": 0.0007932462599362335, "loss": 4.1046, "step": 2810 }, { "epoch": 0.32, "grad_norm": 0.8175439860761643, "learning_rate": 0.0007930958349660323, "loss": 4.1633, "step": 2811 }, { "epoch": 0.32, "grad_norm": 0.9799068150624927, "learning_rate": 0.0007929453695688475, "loss": 4.0224, "step": 2812 }, { "epoch": 0.32, "grad_norm": 0.8581465847702718, "learning_rate": 0.000792794863765433, "loss": 3.8247, "step": 2813 }, { "epoch": 0.32, "grad_norm": 0.987953133543729, "learning_rate": 0.0007926443175765483, "loss": 3.7845, "step": 2814 }, { "epoch": 0.32, "grad_norm": 1.7484317241634189, "learning_rate": 0.0007924937310229583, "loss": 3.9374, "step": 2815 }, { "epoch": 0.32, "grad_norm": 0.9597348774576552, "learning_rate": 0.0007923431041254335, "loss": 4.0311, "step": 2816 }, { "epoch": 0.32, "grad_norm": 0.8819832900092995, 
"learning_rate": 0.00079219243690475, "loss": 4.1954, "step": 2817 }, { "epoch": 0.32, "grad_norm": 1.3312873771324396, "learning_rate": 0.0007920417293816895, "loss": 4.0808, "step": 2818 }, { "epoch": 0.32, "grad_norm": 0.9486595951566851, "learning_rate": 0.0007918909815770394, "loss": 4.1161, "step": 2819 }, { "epoch": 0.32, "grad_norm": 1.5339943011232848, "learning_rate": 0.0007917401935115923, "loss": 4.0167, "step": 2820 }, { "epoch": 0.32, "grad_norm": 0.7273956720276147, "learning_rate": 0.0007915893652061466, "loss": 3.9391, "step": 2821 }, { "epoch": 0.32, "grad_norm": 0.8444988142501272, "learning_rate": 0.000791438496681506, "loss": 3.8366, "step": 2822 }, { "epoch": 0.32, "grad_norm": 0.8309404377817179, "learning_rate": 0.0007912875879584802, "loss": 4.0385, "step": 2823 }, { "epoch": 0.32, "grad_norm": 0.8084936357939582, "learning_rate": 0.0007911366390578841, "loss": 4.0983, "step": 2824 }, { "epoch": 0.32, "grad_norm": 0.7208933777881864, "learning_rate": 0.0007909856500005382, "loss": 3.8361, "step": 2825 }, { "epoch": 0.32, "grad_norm": 1.0589500296206769, "learning_rate": 0.0007908346208072686, "loss": 3.9472, "step": 2826 }, { "epoch": 0.32, "grad_norm": 0.8808713173553334, "learning_rate": 0.0007906835514989068, "loss": 3.9511, "step": 2827 }, { "epoch": 0.32, "grad_norm": 1.177017682711171, "learning_rate": 0.0007905324420962901, "loss": 4.1075, "step": 2828 }, { "epoch": 0.32, "grad_norm": 1.0013931806231864, "learning_rate": 0.0007903812926202611, "loss": 4.0768, "step": 2829 }, { "epoch": 0.32, "grad_norm": 0.8313742335340177, "learning_rate": 0.0007902301030916679, "loss": 3.9747, "step": 2830 }, { "epoch": 0.32, "grad_norm": 0.8812477363620902, "learning_rate": 0.0007900788735313642, "loss": 4.1363, "step": 2831 }, { "epoch": 0.32, "grad_norm": 0.7879259854703444, "learning_rate": 0.0007899276039602094, "loss": 3.9426, "step": 2832 }, { "epoch": 0.32, "grad_norm": 1.009080635150116, "learning_rate": 0.000789776294399068, "loss": 3.8295, "step": 2833 }, { "epoch": 0.32, "grad_norm": 1.0445945856267995, "learning_rate": 0.0007896249448688106, "loss": 4.0023, "step": 2834 }, { "epoch": 0.33, "grad_norm": 0.8839699269635134, "learning_rate": 0.0007894735553903127, "loss": 3.8341, "step": 2835 }, { "epoch": 0.33, "grad_norm": 0.8987833429493101, "learning_rate": 0.0007893221259844558, "loss": 3.8845, "step": 2836 }, { "epoch": 0.33, "grad_norm": 1.108531523989193, "learning_rate": 0.0007891706566721266, "loss": 3.8923, "step": 2837 }, { "epoch": 0.33, "grad_norm": 2.337484662818593, "learning_rate": 0.0007890191474742173, "loss": 3.8549, "step": 2838 }, { "epoch": 0.33, "grad_norm": 0.905249238952922, "learning_rate": 0.0007888675984116258, "loss": 4.1272, "step": 2839 }, { "epoch": 0.33, "grad_norm": 0.9716787691132484, "learning_rate": 0.0007887160095052555, "loss": 4.122, "step": 2840 }, { "epoch": 0.33, "grad_norm": 1.0784125703615728, "learning_rate": 0.000788564380776015, "loss": 4.2529, "step": 2841 }, { "epoch": 0.33, "grad_norm": 1.0492332043340986, "learning_rate": 0.000788412712244819, "loss": 3.8131, "step": 2842 }, { "epoch": 0.33, "grad_norm": 0.797453373362816, "learning_rate": 0.0007882610039325867, "loss": 3.8867, "step": 2843 }, { "epoch": 0.33, "grad_norm": 1.4511602113446982, "learning_rate": 0.0007881092558602437, "loss": 4.0144, "step": 2844 }, { "epoch": 0.33, "grad_norm": 0.9589954246496639, "learning_rate": 0.0007879574680487209, "loss": 4.0077, "step": 2845 }, { "epoch": 0.33, "grad_norm": 0.8482062410721919, "learning_rate": 
0.0007878056405189542, "loss": 4.0044, "step": 2846 }, { "epoch": 0.33, "grad_norm": 0.859823328179625, "learning_rate": 0.0007876537732918855, "loss": 4.1308, "step": 2847 }, { "epoch": 0.33, "grad_norm": 1.535989285295961, "learning_rate": 0.0007875018663884619, "loss": 4.2129, "step": 2848 }, { "epoch": 0.33, "grad_norm": 1.3269116512554076, "learning_rate": 0.0007873499198296361, "loss": 4.0823, "step": 2849 }, { "epoch": 0.33, "grad_norm": 0.9572307604809772, "learning_rate": 0.0007871979336363664, "loss": 4.1961, "step": 2850 }, { "epoch": 0.33, "grad_norm": 1.0964744326872842, "learning_rate": 0.0007870459078296162, "loss": 3.8502, "step": 2851 }, { "epoch": 0.33, "grad_norm": 0.9142853932481386, "learning_rate": 0.0007868938424303545, "loss": 3.728, "step": 2852 }, { "epoch": 0.33, "grad_norm": 1.3893872173599997, "learning_rate": 0.000786741737459556, "loss": 4.0, "step": 2853 }, { "epoch": 0.33, "grad_norm": 1.3653831983174414, "learning_rate": 0.0007865895929382007, "loss": 3.9563, "step": 2854 }, { "epoch": 0.33, "grad_norm": 1.063563152680983, "learning_rate": 0.0007864374088872739, "loss": 3.8091, "step": 2855 }, { "epoch": 0.33, "grad_norm": 0.767918483179044, "learning_rate": 0.0007862851853277664, "loss": 4.0228, "step": 2856 }, { "epoch": 0.33, "grad_norm": 0.979242933485631, "learning_rate": 0.0007861329222806748, "loss": 3.964, "step": 2857 }, { "epoch": 0.33, "grad_norm": 0.801838989467948, "learning_rate": 0.0007859806197670007, "loss": 4.0098, "step": 2858 }, { "epoch": 0.33, "grad_norm": 0.8374370064240751, "learning_rate": 0.0007858282778077513, "loss": 3.8464, "step": 2859 }, { "epoch": 0.33, "grad_norm": 0.9020804270901139, "learning_rate": 0.0007856758964239396, "loss": 4.0, "step": 2860 }, { "epoch": 0.33, "grad_norm": 1.336900654283856, "learning_rate": 0.0007855234756365832, "loss": 4.0716, "step": 2861 }, { "epoch": 0.33, "grad_norm": 1.0950203645504872, "learning_rate": 0.0007853710154667062, "loss": 3.9519, "step": 2862 }, { "epoch": 0.33, "grad_norm": 2.4160341334880693, "learning_rate": 0.0007852185159353371, "loss": 3.9708, "step": 2863 }, { "epoch": 0.33, "grad_norm": 0.8566448875930256, "learning_rate": 0.0007850659770635104, "loss": 3.8193, "step": 2864 }, { "epoch": 0.33, "grad_norm": 0.8144675512978782, "learning_rate": 0.0007849133988722663, "loss": 4.0091, "step": 2865 }, { "epoch": 0.33, "grad_norm": 1.0136090944940248, "learning_rate": 0.0007847607813826496, "loss": 4.0394, "step": 2866 }, { "epoch": 0.33, "grad_norm": 0.7332140394637517, "learning_rate": 0.0007846081246157111, "loss": 4.0508, "step": 2867 }, { "epoch": 0.33, "grad_norm": 0.9695306242566094, "learning_rate": 0.000784455428592507, "loss": 3.9823, "step": 2868 }, { "epoch": 0.33, "grad_norm": 0.9295287874574099, "learning_rate": 0.0007843026933340988, "loss": 3.9699, "step": 2869 }, { "epoch": 0.33, "grad_norm": 0.9540807473404618, "learning_rate": 0.0007841499188615533, "loss": 4.1949, "step": 2870 }, { "epoch": 0.33, "grad_norm": 1.0179788018150149, "learning_rate": 0.0007839971051959427, "loss": 4.2169, "step": 2871 }, { "epoch": 0.33, "grad_norm": 0.8285423784663146, "learning_rate": 0.0007838442523583451, "loss": 4.037, "step": 2872 }, { "epoch": 0.33, "grad_norm": 0.7579226558328955, "learning_rate": 0.0007836913603698434, "loss": 3.922, "step": 2873 }, { "epoch": 0.33, "grad_norm": 0.8848723203265536, "learning_rate": 0.0007835384292515263, "loss": 4.1685, "step": 2874 }, { "epoch": 0.33, "grad_norm": 0.7183387237293107, "learning_rate": 0.0007833854590244875, "loss": 
3.9004, "step": 2875 }, { "epoch": 0.33, "grad_norm": 0.8684093472936824, "learning_rate": 0.0007832324497098266, "loss": 4.0238, "step": 2876 }, { "epoch": 0.33, "grad_norm": 0.8511111880213356, "learning_rate": 0.0007830794013286479, "loss": 3.866, "step": 2877 }, { "epoch": 0.33, "grad_norm": 0.7459336807973939, "learning_rate": 0.0007829263139020619, "loss": 4.0472, "step": 2878 }, { "epoch": 0.33, "grad_norm": 1.177667403122019, "learning_rate": 0.0007827731874511841, "loss": 3.8605, "step": 2879 }, { "epoch": 0.33, "grad_norm": 1.280660526982604, "learning_rate": 0.0007826200219971352, "loss": 3.8723, "step": 2880 }, { "epoch": 0.33, "grad_norm": 1.6069830766106687, "learning_rate": 0.0007824668175610412, "loss": 3.9719, "step": 2881 }, { "epoch": 0.33, "grad_norm": 0.9331988602725475, "learning_rate": 0.0007823135741640343, "loss": 3.9999, "step": 2882 }, { "epoch": 0.33, "grad_norm": 0.7337583099843121, "learning_rate": 0.0007821602918272512, "loss": 3.7754, "step": 2883 }, { "epoch": 0.33, "grad_norm": 0.8239459956750917, "learning_rate": 0.0007820069705718342, "loss": 4.0678, "step": 2884 }, { "epoch": 0.33, "grad_norm": 0.9761894435114252, "learning_rate": 0.0007818536104189313, "loss": 4.0467, "step": 2885 }, { "epoch": 0.33, "grad_norm": 0.839219441250696, "learning_rate": 0.0007817002113896954, "loss": 4.0039, "step": 2886 }, { "epoch": 0.33, "grad_norm": 2.2128003215972742, "learning_rate": 0.000781546773505285, "loss": 3.923, "step": 2887 }, { "epoch": 0.33, "grad_norm": 1.145746946953007, "learning_rate": 0.000781393296786864, "loss": 4.081, "step": 2888 }, { "epoch": 0.33, "grad_norm": 0.808918784357046, "learning_rate": 0.0007812397812556015, "loss": 4.0015, "step": 2889 }, { "epoch": 0.33, "grad_norm": 4.478961693506586, "learning_rate": 0.0007810862269326722, "loss": 3.9032, "step": 2890 }, { "epoch": 0.33, "grad_norm": 0.9487061826045086, "learning_rate": 0.0007809326338392557, "loss": 4.108, "step": 2891 }, { "epoch": 0.33, "grad_norm": 1.127412010516991, "learning_rate": 0.0007807790019965376, "loss": 4.0102, "step": 2892 }, { "epoch": 0.33, "grad_norm": 1.018244149242822, "learning_rate": 0.0007806253314257082, "loss": 4.0505, "step": 2893 }, { "epoch": 0.33, "grad_norm": 0.8422826212336879, "learning_rate": 0.0007804716221479637, "loss": 3.7124, "step": 2894 }, { "epoch": 0.33, "grad_norm": 0.7412520059699383, "learning_rate": 0.000780317874184505, "loss": 3.9786, "step": 2895 }, { "epoch": 0.33, "grad_norm": 0.8848019790691868, "learning_rate": 0.000780164087556539, "loss": 4.1179, "step": 2896 }, { "epoch": 0.33, "grad_norm": 1.8648892484853978, "learning_rate": 0.0007800102622852776, "loss": 4.079, "step": 2897 }, { "epoch": 0.33, "grad_norm": 0.875657876971824, "learning_rate": 0.0007798563983919379, "loss": 3.8901, "step": 2898 }, { "epoch": 0.33, "grad_norm": 0.734413586848325, "learning_rate": 0.0007797024958977425, "loss": 3.8855, "step": 2899 }, { "epoch": 0.33, "grad_norm": 0.8658701731632398, "learning_rate": 0.0007795485548239196, "loss": 4.0001, "step": 2900 }, { "epoch": 0.33, "grad_norm": 0.83000736467333, "learning_rate": 0.0007793945751917022, "loss": 3.9643, "step": 2901 }, { "epoch": 0.33, "grad_norm": 0.8446596205161383, "learning_rate": 0.0007792405570223289, "loss": 3.9056, "step": 2902 }, { "epoch": 0.33, "grad_norm": 0.9809116925008741, "learning_rate": 0.0007790865003370434, "loss": 4.0451, "step": 2903 }, { "epoch": 0.33, "grad_norm": 0.9033862237801152, "learning_rate": 0.0007789324051570951, "loss": 3.8639, "step": 2904 }, { "epoch": 
0.33, "grad_norm": 0.7385336062110736, "learning_rate": 0.0007787782715037387, "loss": 4.0879, "step": 2905 }, { "epoch": 0.33, "grad_norm": 0.6948790109344255, "learning_rate": 0.0007786240993982335, "loss": 3.9518, "step": 2906 }, { "epoch": 0.33, "grad_norm": 0.9848021281883842, "learning_rate": 0.0007784698888618449, "loss": 4.2011, "step": 2907 }, { "epoch": 0.33, "grad_norm": 1.0429557034489951, "learning_rate": 0.0007783156399158433, "loss": 3.9976, "step": 2908 }, { "epoch": 0.33, "grad_norm": 0.9121365097746257, "learning_rate": 0.0007781613525815043, "loss": 3.943, "step": 2909 }, { "epoch": 0.33, "grad_norm": 0.7248504761260748, "learning_rate": 0.000778007026880109, "loss": 4.0865, "step": 2910 }, { "epoch": 0.33, "grad_norm": 0.8624252468760786, "learning_rate": 0.0007778526628329436, "loss": 3.9926, "step": 2911 }, { "epoch": 0.33, "grad_norm": 0.8242392021056203, "learning_rate": 0.0007776982604612996, "loss": 4.294, "step": 2912 }, { "epoch": 0.33, "grad_norm": 0.7163353324671676, "learning_rate": 0.0007775438197864743, "loss": 4.001, "step": 2913 }, { "epoch": 0.33, "grad_norm": 0.8131251111026169, "learning_rate": 0.0007773893408297692, "loss": 4.0809, "step": 2914 }, { "epoch": 0.33, "grad_norm": 0.7360229465696774, "learning_rate": 0.0007772348236124922, "loss": 3.8441, "step": 2915 }, { "epoch": 0.33, "grad_norm": 0.9937452280906106, "learning_rate": 0.0007770802681559558, "loss": 4.1574, "step": 2916 }, { "epoch": 0.33, "grad_norm": 0.7834225277860138, "learning_rate": 0.0007769256744814781, "loss": 3.8823, "step": 2917 }, { "epoch": 0.33, "grad_norm": 0.9196682266781429, "learning_rate": 0.0007767710426103822, "loss": 3.8536, "step": 2918 }, { "epoch": 0.33, "grad_norm": 0.853708026576579, "learning_rate": 0.0007766163725639967, "loss": 3.9578, "step": 2919 }, { "epoch": 0.33, "grad_norm": 0.7380675671111316, "learning_rate": 0.0007764616643636557, "loss": 4.0314, "step": 2920 }, { "epoch": 0.33, "grad_norm": 0.7135048241230377, "learning_rate": 0.0007763069180306976, "loss": 3.8473, "step": 2921 }, { "epoch": 0.34, "grad_norm": 0.9638937792142913, "learning_rate": 0.0007761521335864672, "loss": 3.9451, "step": 2922 }, { "epoch": 0.34, "grad_norm": 0.6767308111689807, "learning_rate": 0.0007759973110523137, "loss": 3.9316, "step": 2923 }, { "epoch": 0.34, "grad_norm": 0.7664839648043394, "learning_rate": 0.0007758424504495925, "loss": 3.8664, "step": 2924 }, { "epoch": 0.34, "grad_norm": 0.8468590461595321, "learning_rate": 0.000775687551799663, "loss": 3.9747, "step": 2925 }, { "epoch": 0.34, "grad_norm": 0.7743151988787623, "learning_rate": 0.0007755326151238908, "loss": 3.8021, "step": 2926 }, { "epoch": 0.34, "grad_norm": 0.7779232878040463, "learning_rate": 0.0007753776404436466, "loss": 3.9844, "step": 2927 }, { "epoch": 0.34, "grad_norm": 0.6810713586237054, "learning_rate": 0.000775222627780306, "loss": 3.8415, "step": 2928 }, { "epoch": 0.34, "grad_norm": 0.9660251821949293, "learning_rate": 0.0007750675771552502, "loss": 3.8987, "step": 2929 }, { "epoch": 0.34, "grad_norm": 0.7161658231716391, "learning_rate": 0.0007749124885898654, "loss": 4.0636, "step": 2930 }, { "epoch": 0.34, "grad_norm": 0.7572314985972287, "learning_rate": 0.0007747573621055431, "loss": 3.9436, "step": 2931 }, { "epoch": 0.34, "grad_norm": 0.7820828362781866, "learning_rate": 0.00077460219772368, "loss": 3.8519, "step": 2932 }, { "epoch": 0.34, "grad_norm": 0.8648705704087024, "learning_rate": 0.0007744469954656781, "loss": 3.9031, "step": 2933 }, { "epoch": 0.34, "grad_norm": 
0.6170591665218305, "learning_rate": 0.0007742917553529447, "loss": 3.7378, "step": 2934 }, { "epoch": 0.34, "grad_norm": 0.9551509399427697, "learning_rate": 0.000774136477406892, "loss": 3.7949, "step": 2935 }, { "epoch": 0.34, "grad_norm": 0.7342213742401721, "learning_rate": 0.0007739811616489378, "loss": 4.0759, "step": 2936 }, { "epoch": 0.34, "grad_norm": 1.100306975324247, "learning_rate": 0.0007738258081005049, "loss": 3.9038, "step": 2937 }, { "epoch": 0.34, "grad_norm": 2.2289544862556783, "learning_rate": 0.0007736704167830216, "loss": 3.8585, "step": 2938 }, { "epoch": 0.34, "grad_norm": 0.8074985088783863, "learning_rate": 0.0007735149877179206, "loss": 3.689, "step": 2939 }, { "epoch": 0.34, "grad_norm": 0.7349632697798325, "learning_rate": 0.0007733595209266408, "loss": 3.7079, "step": 2940 }, { "epoch": 0.34, "grad_norm": 0.7858958297337546, "learning_rate": 0.000773204016430626, "loss": 3.8876, "step": 2941 }, { "epoch": 0.34, "grad_norm": 0.9231860919722755, "learning_rate": 0.0007730484742513247, "loss": 4.0616, "step": 2942 }, { "epoch": 0.34, "grad_norm": 0.6922810526825621, "learning_rate": 0.0007728928944101912, "loss": 3.7704, "step": 2943 }, { "epoch": 0.34, "grad_norm": 0.9142938950621817, "learning_rate": 0.0007727372769286846, "loss": 3.7842, "step": 2944 }, { "epoch": 0.34, "grad_norm": 0.7533169412240259, "learning_rate": 0.0007725816218282697, "loss": 3.9859, "step": 2945 }, { "epoch": 0.34, "grad_norm": 0.7724723882424335, "learning_rate": 0.0007724259291304159, "loss": 3.9933, "step": 2946 }, { "epoch": 0.34, "grad_norm": 0.7465063093057005, "learning_rate": 0.0007722701988565979, "loss": 4.0159, "step": 2947 }, { "epoch": 0.34, "grad_norm": 0.8088905627363308, "learning_rate": 0.0007721144310282961, "loss": 3.7711, "step": 2948 }, { "epoch": 0.34, "grad_norm": 1.111862314042159, "learning_rate": 0.0007719586256669955, "loss": 4.1872, "step": 2949 }, { "epoch": 0.34, "grad_norm": 2.161762487871592, "learning_rate": 0.0007718027827941865, "loss": 4.3064, "step": 2950 }, { "epoch": 0.34, "grad_norm": 0.7974502237458116, "learning_rate": 0.0007716469024313645, "loss": 4.08, "step": 2951 }, { "epoch": 0.34, "grad_norm": 1.085970878280513, "learning_rate": 0.0007714909846000304, "loss": 3.8732, "step": 2952 }, { "epoch": 0.34, "grad_norm": 1.8450186641935062, "learning_rate": 0.0007713350293216903, "loss": 3.8859, "step": 2953 }, { "epoch": 0.34, "grad_norm": 1.2704497736084017, "learning_rate": 0.0007711790366178548, "loss": 4.2942, "step": 2954 }, { "epoch": 0.34, "grad_norm": 0.9067371099823303, "learning_rate": 0.0007710230065100404, "loss": 4.051, "step": 2955 }, { "epoch": 0.34, "grad_norm": 1.2357457459662264, "learning_rate": 0.0007708669390197683, "loss": 3.7663, "step": 2956 }, { "epoch": 0.34, "grad_norm": 0.7590290861684584, "learning_rate": 0.0007707108341685654, "loss": 3.9191, "step": 2957 }, { "epoch": 0.34, "grad_norm": 0.8878677650418407, "learning_rate": 0.000770554691977963, "loss": 4.1141, "step": 2958 }, { "epoch": 0.34, "grad_norm": 0.770013801885147, "learning_rate": 0.0007703985124694981, "loss": 3.5428, "step": 2959 }, { "epoch": 0.34, "grad_norm": 1.0984254264251647, "learning_rate": 0.0007702422956647126, "loss": 4.0802, "step": 2960 }, { "epoch": 0.34, "grad_norm": 0.8026535163557686, "learning_rate": 0.0007700860415851538, "loss": 3.9167, "step": 2961 }, { "epoch": 0.34, "grad_norm": 0.7412435614588473, "learning_rate": 0.0007699297502523739, "loss": 3.9791, "step": 2962 }, { "epoch": 0.34, "grad_norm": 1.7913969955625864, 
"learning_rate": 0.0007697734216879302, "loss": 3.9601, "step": 2963 }, { "epoch": 0.34, "grad_norm": 0.7536513026369986, "learning_rate": 0.0007696170559133853, "loss": 3.8034, "step": 2964 }, { "epoch": 0.34, "grad_norm": 0.7583983320214901, "learning_rate": 0.000769460652950307, "loss": 3.9985, "step": 2965 }, { "epoch": 0.34, "grad_norm": 0.8386557749609159, "learning_rate": 0.0007693042128202679, "loss": 3.8334, "step": 2966 }, { "epoch": 0.34, "grad_norm": 0.9003381509455561, "learning_rate": 0.0007691477355448461, "loss": 3.7907, "step": 2967 }, { "epoch": 0.34, "grad_norm": 0.76334686439675, "learning_rate": 0.0007689912211456247, "loss": 3.7665, "step": 2968 }, { "epoch": 0.34, "grad_norm": 1.257008968103748, "learning_rate": 0.0007688346696441917, "loss": 3.9964, "step": 2969 }, { "epoch": 0.34, "grad_norm": 0.8281195995983583, "learning_rate": 0.0007686780810621406, "loss": 3.8956, "step": 2970 }, { "epoch": 0.34, "grad_norm": 0.7725818835325213, "learning_rate": 0.0007685214554210693, "loss": 3.89, "step": 2971 }, { "epoch": 0.34, "grad_norm": 0.8004885951809324, "learning_rate": 0.0007683647927425821, "loss": 3.8949, "step": 2972 }, { "epoch": 0.34, "grad_norm": 0.7934870909957626, "learning_rate": 0.0007682080930482871, "loss": 3.9479, "step": 2973 }, { "epoch": 0.34, "grad_norm": 0.9156712306189136, "learning_rate": 0.0007680513563597982, "loss": 3.8559, "step": 2974 }, { "epoch": 0.34, "grad_norm": 2.180796506092128, "learning_rate": 0.0007678945826987343, "loss": 4.1406, "step": 2975 }, { "epoch": 0.34, "grad_norm": 0.8526626528663902, "learning_rate": 0.0007677377720867189, "loss": 4.175, "step": 2976 }, { "epoch": 0.34, "grad_norm": 1.3389952498440083, "learning_rate": 0.0007675809245453818, "loss": 3.9473, "step": 2977 }, { "epoch": 0.34, "grad_norm": 1.0100353497932466, "learning_rate": 0.0007674240400963566, "loss": 3.7623, "step": 2978 }, { "epoch": 0.34, "grad_norm": 0.7655756918841731, "learning_rate": 0.0007672671187612826, "loss": 3.7923, "step": 2979 }, { "epoch": 0.34, "grad_norm": 0.9132683628187853, "learning_rate": 0.0007671101605618041, "loss": 4.0524, "step": 2980 }, { "epoch": 0.34, "grad_norm": 0.7921889104793208, "learning_rate": 0.0007669531655195705, "loss": 3.8596, "step": 2981 }, { "epoch": 0.34, "grad_norm": 0.8211284760048864, "learning_rate": 0.0007667961336562364, "loss": 3.8549, "step": 2982 }, { "epoch": 0.34, "grad_norm": 0.9199432023187821, "learning_rate": 0.0007666390649934612, "loss": 4.0639, "step": 2983 }, { "epoch": 0.34, "grad_norm": 0.7854459559781342, "learning_rate": 0.0007664819595529095, "loss": 4.0108, "step": 2984 }, { "epoch": 0.34, "grad_norm": 0.7303724456429235, "learning_rate": 0.0007663248173562513, "loss": 3.8437, "step": 2985 }, { "epoch": 0.34, "grad_norm": 0.7040190681756935, "learning_rate": 0.000766167638425161, "loss": 3.8134, "step": 2986 }, { "epoch": 0.34, "grad_norm": 0.7365329355425521, "learning_rate": 0.0007660104227813188, "loss": 3.8334, "step": 2987 }, { "epoch": 0.34, "grad_norm": 0.8189182599476829, "learning_rate": 0.0007658531704464092, "loss": 3.9678, "step": 2988 }, { "epoch": 0.34, "grad_norm": 0.8297478248491008, "learning_rate": 0.0007656958814421225, "loss": 3.7614, "step": 2989 }, { "epoch": 0.34, "grad_norm": 0.9350297851463709, "learning_rate": 0.0007655385557901534, "loss": 3.8068, "step": 2990 }, { "epoch": 0.34, "grad_norm": 0.8594299995847787, "learning_rate": 0.0007653811935122022, "loss": 4.1316, "step": 2991 }, { "epoch": 0.34, "grad_norm": 0.8960493382977537, "learning_rate": 
0.0007652237946299741, "loss": 4.2084, "step": 2992 }, { "epoch": 0.34, "grad_norm": 1.2404132037442457, "learning_rate": 0.000765066359165179, "loss": 3.9282, "step": 2993 }, { "epoch": 0.34, "grad_norm": 0.9137810328072783, "learning_rate": 0.0007649088871395324, "loss": 3.9315, "step": 2994 }, { "epoch": 0.34, "grad_norm": 1.043468082579711, "learning_rate": 0.0007647513785747545, "loss": 4.1007, "step": 2995 }, { "epoch": 0.34, "grad_norm": 0.8494596296098305, "learning_rate": 0.0007645938334925704, "loss": 3.9095, "step": 2996 }, { "epoch": 0.34, "grad_norm": 0.8043187345250739, "learning_rate": 0.0007644362519147106, "loss": 3.9171, "step": 2997 }, { "epoch": 0.34, "grad_norm": 0.687336536827296, "learning_rate": 0.0007642786338629106, "loss": 3.9867, "step": 2998 }, { "epoch": 0.34, "grad_norm": 0.7506319447646895, "learning_rate": 0.0007641209793589105, "loss": 4.0652, "step": 2999 }, { "epoch": 0.34, "grad_norm": 0.7072485676855512, "learning_rate": 0.0007639632884244561, "loss": 4.0171, "step": 3000 }, { "epoch": 0.34, "grad_norm": 0.7568744451040508, "learning_rate": 0.0007638055610812974, "loss": 3.7548, "step": 3001 }, { "epoch": 0.34, "grad_norm": 0.8426975834081122, "learning_rate": 0.0007636477973511903, "loss": 4.0224, "step": 3002 }, { "epoch": 0.34, "grad_norm": 0.9004817997692546, "learning_rate": 0.0007634899972558951, "loss": 3.8699, "step": 3003 }, { "epoch": 0.34, "grad_norm": 0.8169066198598964, "learning_rate": 0.0007633321608171774, "loss": 3.8451, "step": 3004 }, { "epoch": 0.34, "grad_norm": 0.7524365041376977, "learning_rate": 0.0007631742880568075, "loss": 3.8263, "step": 3005 }, { "epoch": 0.34, "grad_norm": 0.9959690822281467, "learning_rate": 0.0007630163789965614, "loss": 4.2703, "step": 3006 }, { "epoch": 0.34, "grad_norm": 2.1383531716793738, "learning_rate": 0.0007628584336582192, "loss": 4.1009, "step": 3007 }, { "epoch": 0.34, "grad_norm": 0.9699829622167304, "learning_rate": 0.0007627004520635666, "loss": 4.0441, "step": 3008 }, { "epoch": 0.35, "grad_norm": 0.8136473825547916, "learning_rate": 0.0007625424342343943, "loss": 3.7512, "step": 3009 }, { "epoch": 0.35, "grad_norm": 0.7878253954472447, "learning_rate": 0.0007623843801924977, "loss": 3.9893, "step": 3010 }, { "epoch": 0.35, "grad_norm": 1.7563809605855127, "learning_rate": 0.0007622262899596772, "loss": 3.8078, "step": 3011 }, { "epoch": 0.35, "grad_norm": 0.7333622863688614, "learning_rate": 0.0007620681635577386, "loss": 4.0582, "step": 3012 }, { "epoch": 0.35, "grad_norm": 0.9081769545050702, "learning_rate": 0.0007619100010084923, "loss": 3.9202, "step": 3013 }, { "epoch": 0.35, "grad_norm": 0.7032132014860292, "learning_rate": 0.0007617518023337538, "loss": 3.732, "step": 3014 }, { "epoch": 0.35, "grad_norm": 0.8883923190829246, "learning_rate": 0.0007615935675553436, "loss": 3.9387, "step": 3015 }, { "epoch": 0.35, "grad_norm": 0.869309227211105, "learning_rate": 0.0007614352966950871, "loss": 4.0354, "step": 3016 }, { "epoch": 0.35, "grad_norm": 2.104976800890115, "learning_rate": 0.000761276989774815, "loss": 3.9425, "step": 3017 }, { "epoch": 0.35, "grad_norm": 2.53155839461406, "learning_rate": 0.0007611186468163625, "loss": 3.8651, "step": 3018 }, { "epoch": 0.35, "grad_norm": 0.8473430785583483, "learning_rate": 0.0007609602678415699, "loss": 3.7968, "step": 3019 }, { "epoch": 0.35, "grad_norm": 0.8364637039010125, "learning_rate": 0.0007608018528722829, "loss": 3.6283, "step": 3020 }, { "epoch": 0.35, "grad_norm": 0.8303141737137273, "learning_rate": 0.0007606434019303514, 
"loss": 4.0719, "step": 3021 }, { "epoch": 0.35, "grad_norm": 0.8373557767884118, "learning_rate": 0.0007604849150376311, "loss": 4.054, "step": 3022 }, { "epoch": 0.35, "grad_norm": 0.9621756700039185, "learning_rate": 0.0007603263922159822, "loss": 3.9521, "step": 3023 }, { "epoch": 0.35, "grad_norm": 0.9094388891996484, "learning_rate": 0.0007601678334872695, "loss": 4.036, "step": 3024 }, { "epoch": 0.35, "grad_norm": 0.9985052366245435, "learning_rate": 0.0007600092388733635, "loss": 4.1684, "step": 3025 }, { "epoch": 0.35, "grad_norm": 0.9283488279693541, "learning_rate": 0.0007598506083961394, "loss": 4.2164, "step": 3026 }, { "epoch": 0.35, "grad_norm": 1.0816180725671583, "learning_rate": 0.0007596919420774768, "loss": 4.0991, "step": 3027 }, { "epoch": 0.35, "grad_norm": 0.7601158574734403, "learning_rate": 0.0007595332399392611, "loss": 3.9531, "step": 3028 }, { "epoch": 0.35, "grad_norm": 0.9748301828256695, "learning_rate": 0.0007593745020033822, "loss": 3.9099, "step": 3029 }, { "epoch": 0.35, "grad_norm": 0.896406073322048, "learning_rate": 0.0007592157282917347, "loss": 4.062, "step": 3030 }, { "epoch": 0.35, "grad_norm": 0.8389664045915394, "learning_rate": 0.0007590569188262186, "loss": 3.9929, "step": 3031 }, { "epoch": 0.35, "grad_norm": 0.7489243685259497, "learning_rate": 0.0007588980736287389, "loss": 3.9883, "step": 3032 }, { "epoch": 0.35, "grad_norm": 0.4908611445582975, "learning_rate": 0.0007587391927212046, "loss": 4.088, "step": 3033 }, { "epoch": 0.35, "grad_norm": 0.6560772673690601, "learning_rate": 0.0007585802761255309, "loss": 3.8892, "step": 3034 }, { "epoch": 0.35, "grad_norm": 0.7167553215820789, "learning_rate": 0.0007584213238636372, "loss": 3.7584, "step": 3035 }, { "epoch": 0.35, "grad_norm": 0.7082980666049122, "learning_rate": 0.0007582623359574476, "loss": 3.8304, "step": 3036 }, { "epoch": 0.35, "grad_norm": 0.975168455171737, "learning_rate": 0.0007581033124288918, "loss": 3.9157, "step": 3037 }, { "epoch": 0.35, "grad_norm": 0.7809516229223278, "learning_rate": 0.0007579442532999039, "loss": 3.7467, "step": 3038 }, { "epoch": 0.35, "grad_norm": 0.7190620996306497, "learning_rate": 0.0007577851585924231, "loss": 3.8291, "step": 3039 }, { "epoch": 0.35, "grad_norm": 1.339859376783794, "learning_rate": 0.0007576260283283935, "loss": 4.0776, "step": 3040 }, { "epoch": 0.35, "grad_norm": 0.8824407109800702, "learning_rate": 0.0007574668625297641, "loss": 3.6608, "step": 3041 }, { "epoch": 0.35, "grad_norm": 0.7101953982557019, "learning_rate": 0.000757307661218489, "loss": 3.8763, "step": 3042 }, { "epoch": 0.35, "grad_norm": 1.2530188753850788, "learning_rate": 0.0007571484244165266, "loss": 4.0226, "step": 3043 }, { "epoch": 0.35, "grad_norm": 0.7832920891285563, "learning_rate": 0.0007569891521458405, "loss": 4.0326, "step": 3044 }, { "epoch": 0.35, "grad_norm": 0.6568436419009114, "learning_rate": 0.0007568298444283999, "loss": 3.7683, "step": 3045 }, { "epoch": 0.35, "grad_norm": 0.8854080280748612, "learning_rate": 0.0007566705012861777, "loss": 3.7149, "step": 3046 }, { "epoch": 0.35, "grad_norm": 0.6767146680658095, "learning_rate": 0.0007565111227411524, "loss": 3.8269, "step": 3047 }, { "epoch": 0.35, "grad_norm": 0.7244322443161322, "learning_rate": 0.0007563517088153074, "loss": 3.9321, "step": 3048 }, { "epoch": 0.35, "grad_norm": 1.6947820502176265, "learning_rate": 0.0007561922595306305, "loss": 4.2037, "step": 3049 }, { "epoch": 0.35, "grad_norm": 0.8583708942292468, "learning_rate": 0.000756032774909115, "loss": 4.1302, "step": 
3050 }, { "epoch": 0.35, "grad_norm": 3.7727616471342067, "learning_rate": 0.0007558732549727586, "loss": 3.9912, "step": 3051 }, { "epoch": 0.35, "grad_norm": 0.9152919179890534, "learning_rate": 0.0007557136997435641, "loss": 3.9228, "step": 3052 }, { "epoch": 0.35, "grad_norm": 1.045831402346733, "learning_rate": 0.000755554109243539, "loss": 3.8593, "step": 3053 }, { "epoch": 0.35, "grad_norm": 0.6859013341857966, "learning_rate": 0.0007553944834946958, "loss": 3.7818, "step": 3054 }, { "epoch": 0.35, "grad_norm": 0.8171094387718647, "learning_rate": 0.0007552348225190519, "loss": 3.9794, "step": 3055 }, { "epoch": 0.35, "grad_norm": 0.7625613521025649, "learning_rate": 0.0007550751263386295, "loss": 3.8417, "step": 3056 }, { "epoch": 0.35, "grad_norm": 0.7841858464700694, "learning_rate": 0.0007549153949754557, "loss": 3.9837, "step": 3057 }, { "epoch": 0.35, "grad_norm": 0.7517253058449689, "learning_rate": 0.0007547556284515621, "loss": 3.8835, "step": 3058 }, { "epoch": 0.35, "grad_norm": 0.8507112108001218, "learning_rate": 0.0007545958267889856, "loss": 4.0073, "step": 3059 }, { "epoch": 0.35, "grad_norm": 0.7005477079457738, "learning_rate": 0.0007544359900097681, "loss": 3.823, "step": 3060 }, { "epoch": 0.35, "grad_norm": 0.8050167745888608, "learning_rate": 0.0007542761181359556, "loss": 4.0794, "step": 3061 }, { "epoch": 0.35, "grad_norm": 0.7021981240251157, "learning_rate": 0.0007541162111895994, "loss": 3.8468, "step": 3062 }, { "epoch": 0.35, "grad_norm": 0.9553900313240168, "learning_rate": 0.000753956269192756, "loss": 3.9106, "step": 3063 }, { "epoch": 0.35, "grad_norm": 0.7633126765470855, "learning_rate": 0.0007537962921674861, "loss": 4.0844, "step": 3064 }, { "epoch": 0.35, "grad_norm": 0.7706291154588064, "learning_rate": 0.0007536362801358554, "loss": 4.022, "step": 3065 }, { "epoch": 0.35, "grad_norm": 0.9428771432155388, "learning_rate": 0.0007534762331199345, "loss": 4.0319, "step": 3066 }, { "epoch": 0.35, "grad_norm": 0.7160906265580018, "learning_rate": 0.0007533161511417992, "loss": 3.939, "step": 3067 }, { "epoch": 0.35, "grad_norm": 0.6872387385717094, "learning_rate": 0.0007531560342235293, "loss": 3.9288, "step": 3068 }, { "epoch": 0.35, "grad_norm": 1.1970262727669907, "learning_rate": 0.0007529958823872101, "loss": 3.7959, "step": 3069 }, { "epoch": 0.35, "grad_norm": 1.1110325741543214, "learning_rate": 0.0007528356956549316, "loss": 3.9563, "step": 3070 }, { "epoch": 0.35, "grad_norm": 0.8915116113776023, "learning_rate": 0.0007526754740487881, "loss": 3.995, "step": 3071 }, { "epoch": 0.35, "grad_norm": 1.7382914697889293, "learning_rate": 0.0007525152175908796, "loss": 3.7591, "step": 3072 }, { "epoch": 0.35, "grad_norm": 0.9758499919319071, "learning_rate": 0.0007523549263033103, "loss": 3.9168, "step": 3073 }, { "epoch": 0.35, "grad_norm": 0.694402823820264, "learning_rate": 0.000752194600208189, "loss": 3.9819, "step": 3074 }, { "epoch": 0.35, "grad_norm": 0.7997408756703108, "learning_rate": 0.00075203423932763, "loss": 3.5951, "step": 3075 }, { "epoch": 0.35, "grad_norm": 0.7080981008466017, "learning_rate": 0.0007518738436837518, "loss": 4.1161, "step": 3076 }, { "epoch": 0.35, "grad_norm": 0.7607462515883542, "learning_rate": 0.000751713413298678, "loss": 3.9088, "step": 3077 }, { "epoch": 0.35, "grad_norm": 0.8422111564654339, "learning_rate": 0.0007515529481945372, "loss": 4.0031, "step": 3078 }, { "epoch": 0.35, "grad_norm": 0.7011243087702548, "learning_rate": 0.000751392448393462, "loss": 3.9814, "step": 3079 }, { "epoch": 0.35, 
"grad_norm": 1.0711827867213224, "learning_rate": 0.0007512319139175905, "loss": 3.9544, "step": 3080 }, { "epoch": 0.35, "grad_norm": 0.7232714753015131, "learning_rate": 0.0007510713447890653, "loss": 4.0006, "step": 3081 }, { "epoch": 0.35, "grad_norm": 2.3033033333694086, "learning_rate": 0.0007509107410300342, "loss": 3.8255, "step": 3082 }, { "epoch": 0.35, "grad_norm": 0.7578899850242595, "learning_rate": 0.0007507501026626491, "loss": 3.8752, "step": 3083 }, { "epoch": 0.35, "grad_norm": 1.1720208894202784, "learning_rate": 0.000750589429709067, "loss": 3.8161, "step": 3084 }, { "epoch": 0.35, "grad_norm": 0.7903348609240856, "learning_rate": 0.0007504287221914499, "loss": 3.947, "step": 3085 }, { "epoch": 0.35, "grad_norm": 2.0754117140428554, "learning_rate": 0.0007502679801319641, "loss": 3.9204, "step": 3086 }, { "epoch": 0.35, "grad_norm": 1.1592786236846833, "learning_rate": 0.0007501072035527807, "loss": 3.7849, "step": 3087 }, { "epoch": 0.35, "grad_norm": 0.6758083590327713, "learning_rate": 0.0007499463924760764, "loss": 3.8991, "step": 3088 }, { "epoch": 0.35, "grad_norm": 0.7852635857892741, "learning_rate": 0.0007497855469240316, "loss": 4.0954, "step": 3089 }, { "epoch": 0.35, "grad_norm": 0.9737142954409109, "learning_rate": 0.0007496246669188319, "loss": 3.8615, "step": 3090 }, { "epoch": 0.35, "grad_norm": 0.8055824494647531, "learning_rate": 0.0007494637524826677, "loss": 3.7533, "step": 3091 }, { "epoch": 0.35, "grad_norm": 0.9108048422795378, "learning_rate": 0.000749302803637734, "loss": 3.8841, "step": 3092 }, { "epoch": 0.35, "grad_norm": 1.2019983643152932, "learning_rate": 0.0007491418204062307, "loss": 4.1264, "step": 3093 }, { "epoch": 0.35, "grad_norm": 0.8169951239240373, "learning_rate": 0.0007489808028103622, "loss": 3.9659, "step": 3094 }, { "epoch": 0.35, "grad_norm": 0.8432058685824899, "learning_rate": 0.0007488197508723383, "loss": 3.8021, "step": 3095 }, { "epoch": 0.36, "grad_norm": 0.7874335603748958, "learning_rate": 0.0007486586646143725, "loss": 4.0614, "step": 3096 }, { "epoch": 0.36, "grad_norm": 1.06822558352207, "learning_rate": 0.0007484975440586838, "loss": 4.0983, "step": 3097 }, { "epoch": 0.36, "grad_norm": 0.7219634431370882, "learning_rate": 0.0007483363892274958, "loss": 3.9016, "step": 3098 }, { "epoch": 0.36, "grad_norm": 2.1007781342941483, "learning_rate": 0.0007481752001430364, "loss": 4.0096, "step": 3099 }, { "epoch": 0.36, "grad_norm": 0.6915312617632342, "learning_rate": 0.000748013976827539, "loss": 3.8925, "step": 3100 }, { "epoch": 0.36, "grad_norm": 0.749642186603448, "learning_rate": 0.0007478527193032409, "loss": 3.7712, "step": 3101 }, { "epoch": 0.36, "grad_norm": 1.660094026150948, "learning_rate": 0.0007476914275923848, "loss": 3.742, "step": 3102 }, { "epoch": 0.36, "grad_norm": 0.8443055383996099, "learning_rate": 0.0007475301017172177, "loss": 3.9026, "step": 3103 }, { "epoch": 0.36, "grad_norm": 0.7752507762412318, "learning_rate": 0.0007473687416999913, "loss": 4.205, "step": 3104 }, { "epoch": 0.36, "grad_norm": 0.8024804596967614, "learning_rate": 0.0007472073475629624, "loss": 3.8433, "step": 3105 }, { "epoch": 0.36, "grad_norm": 1.019909347838662, "learning_rate": 0.000747045919328392, "loss": 3.8949, "step": 3106 }, { "epoch": 0.36, "grad_norm": 0.7967308592686851, "learning_rate": 0.0007468844570185462, "loss": 4.0036, "step": 3107 }, { "epoch": 0.36, "grad_norm": 1.1522462200228638, "learning_rate": 0.0007467229606556955, "loss": 3.987, "step": 3108 }, { "epoch": 0.36, "grad_norm": 
0.8647817155514017, "learning_rate": 0.0007465614302621155, "loss": 4.0021, "step": 3109 }, { "epoch": 0.36, "grad_norm": 2.7068808450901347, "learning_rate": 0.0007463998658600861, "loss": 3.75, "step": 3110 }, { "epoch": 0.36, "grad_norm": 0.8198909621873318, "learning_rate": 0.0007462382674718919, "loss": 3.8772, "step": 3111 }, { "epoch": 0.36, "grad_norm": 0.7374761870382877, "learning_rate": 0.0007460766351198225, "loss": 3.8594, "step": 3112 }, { "epoch": 0.36, "grad_norm": 0.8256654074137212, "learning_rate": 0.0007459149688261719, "loss": 4.0258, "step": 3113 }, { "epoch": 0.36, "grad_norm": 0.7727670549827007, "learning_rate": 0.0007457532686132389, "loss": 3.8579, "step": 3114 }, { "epoch": 0.36, "grad_norm": 1.0031070860080882, "learning_rate": 0.0007455915345033271, "loss": 3.9712, "step": 3115 }, { "epoch": 0.36, "grad_norm": 0.7656659480460292, "learning_rate": 0.0007454297665187442, "loss": 3.8029, "step": 3116 }, { "epoch": 0.36, "grad_norm": 0.9754790228799204, "learning_rate": 0.0007452679646818037, "loss": 3.9451, "step": 3117 }, { "epoch": 0.36, "grad_norm": 0.7917691542787434, "learning_rate": 0.0007451061290148224, "loss": 3.9658, "step": 3118 }, { "epoch": 0.36, "grad_norm": 0.9382090167353032, "learning_rate": 0.000744944259540123, "loss": 4.0586, "step": 3119 }, { "epoch": 0.36, "grad_norm": 0.8371645892532928, "learning_rate": 0.000744782356280032, "loss": 3.8716, "step": 3120 }, { "epoch": 0.36, "grad_norm": 0.7835029463219834, "learning_rate": 0.0007446204192568807, "loss": 3.7908, "step": 3121 }, { "epoch": 0.36, "grad_norm": 0.8977910416604644, "learning_rate": 0.0007444584484930057, "loss": 3.9718, "step": 3122 }, { "epoch": 0.36, "grad_norm": 1.0527767053985106, "learning_rate": 0.0007442964440107476, "loss": 3.956, "step": 3123 }, { "epoch": 0.36, "grad_norm": 0.8024902538819312, "learning_rate": 0.0007441344058324515, "loss": 4.0293, "step": 3124 }, { "epoch": 0.36, "grad_norm": 0.6981485704593454, "learning_rate": 0.0007439723339804679, "loss": 3.7, "step": 3125 }, { "epoch": 0.36, "grad_norm": 0.9065655927900654, "learning_rate": 0.0007438102284771513, "loss": 4.0862, "step": 3126 }, { "epoch": 0.36, "grad_norm": 0.8574143902594201, "learning_rate": 0.0007436480893448611, "loss": 3.9624, "step": 3127 }, { "epoch": 0.36, "grad_norm": 1.0570946493046487, "learning_rate": 0.0007434859166059616, "loss": 3.923, "step": 3128 }, { "epoch": 0.36, "grad_norm": 1.1873984954104526, "learning_rate": 0.0007433237102828209, "loss": 3.8631, "step": 3129 }, { "epoch": 0.36, "grad_norm": 1.954586600857509, "learning_rate": 0.0007431614703978125, "loss": 3.9764, "step": 3130 }, { "epoch": 0.36, "grad_norm": 0.7351728628046419, "learning_rate": 0.0007429991969733144, "loss": 3.6665, "step": 3131 }, { "epoch": 0.36, "grad_norm": 0.8165431557761316, "learning_rate": 0.0007428368900317092, "loss": 3.7553, "step": 3132 }, { "epoch": 0.36, "grad_norm": 2.55577118336618, "learning_rate": 0.0007426745495953838, "loss": 3.8041, "step": 3133 }, { "epoch": 0.36, "grad_norm": 0.7676073226383014, "learning_rate": 0.00074251217568673, "loss": 3.9481, "step": 3134 }, { "epoch": 0.36, "grad_norm": 0.6627287510228065, "learning_rate": 0.0007423497683281444, "loss": 3.7552, "step": 3135 }, { "epoch": 0.36, "grad_norm": 1.3062759783765603, "learning_rate": 0.0007421873275420277, "loss": 4.065, "step": 3136 }, { "epoch": 0.36, "grad_norm": 0.8683106415343326, "learning_rate": 0.0007420248533507858, "loss": 4.0374, "step": 3137 }, { "epoch": 0.36, "grad_norm": 2.33644207151727, 
"learning_rate": 0.000741862345776829, "loss": 3.783, "step": 3138 }, { "epoch": 0.36, "grad_norm": 0.7671899799859807, "learning_rate": 0.0007416998048425716, "loss": 4.02, "step": 3139 }, { "epoch": 0.36, "grad_norm": 0.7477054363158474, "learning_rate": 0.0007415372305704334, "loss": 3.9891, "step": 3140 }, { "epoch": 0.36, "grad_norm": 0.903085347824984, "learning_rate": 0.0007413746229828384, "loss": 4.0231, "step": 3141 }, { "epoch": 0.36, "grad_norm": 1.5212810031183515, "learning_rate": 0.0007412119821022153, "loss": 3.6641, "step": 3142 }, { "epoch": 0.36, "grad_norm": 0.888375812998574, "learning_rate": 0.000741049307950997, "loss": 3.7996, "step": 3143 }, { "epoch": 0.36, "grad_norm": 0.8901911880717717, "learning_rate": 0.0007408866005516215, "loss": 3.8594, "step": 3144 }, { "epoch": 0.36, "grad_norm": 3.6728021503078563, "learning_rate": 0.0007407238599265313, "loss": 3.8515, "step": 3145 }, { "epoch": 0.36, "grad_norm": 4.5593518025291555, "learning_rate": 0.0007405610860981731, "loss": 3.8181, "step": 3146 }, { "epoch": 0.36, "grad_norm": 0.9133075421222895, "learning_rate": 0.0007403982790889987, "loss": 3.6731, "step": 3147 }, { "epoch": 0.36, "grad_norm": 0.9256552215607333, "learning_rate": 0.0007402354389214642, "loss": 3.9844, "step": 3148 }, { "epoch": 0.36, "grad_norm": 0.8876713678891404, "learning_rate": 0.0007400725656180298, "loss": 4.0332, "step": 3149 }, { "epoch": 0.36, "grad_norm": 1.5130116124570052, "learning_rate": 0.0007399096592011616, "loss": 3.9372, "step": 3150 }, { "epoch": 0.36, "grad_norm": 0.853638722623953, "learning_rate": 0.0007397467196933287, "loss": 4.004, "step": 3151 }, { "epoch": 0.36, "grad_norm": 0.8057886203292006, "learning_rate": 0.000739583747117006, "loss": 3.6706, "step": 3152 }, { "epoch": 0.36, "grad_norm": 1.0608120922075939, "learning_rate": 0.0007394207414946723, "loss": 3.815, "step": 3153 }, { "epoch": 0.36, "grad_norm": 0.738727571189008, "learning_rate": 0.0007392577028488109, "loss": 3.7535, "step": 3154 }, { "epoch": 0.36, "grad_norm": 1.4429317506139767, "learning_rate": 0.0007390946312019102, "loss": 3.7135, "step": 3155 }, { "epoch": 0.36, "grad_norm": 0.8411548611101233, "learning_rate": 0.0007389315265764626, "loss": 3.8632, "step": 3156 }, { "epoch": 0.36, "grad_norm": 0.7641403495492104, "learning_rate": 0.0007387683889949655, "loss": 3.9677, "step": 3157 }, { "epoch": 0.36, "grad_norm": 1.1341301689495917, "learning_rate": 0.0007386052184799204, "loss": 4.209, "step": 3158 }, { "epoch": 0.36, "grad_norm": 0.8604416435688076, "learning_rate": 0.0007384420150538336, "loss": 3.6821, "step": 3159 }, { "epoch": 0.36, "grad_norm": 0.7928202627588236, "learning_rate": 0.000738278778739216, "loss": 4.0018, "step": 3160 }, { "epoch": 0.36, "grad_norm": 0.6494677466916094, "learning_rate": 0.0007381155095585827, "loss": 3.7879, "step": 3161 }, { "epoch": 0.36, "grad_norm": 1.0749660215216255, "learning_rate": 0.000737952207534454, "loss": 3.9171, "step": 3162 }, { "epoch": 0.36, "grad_norm": 2.2975662081911907, "learning_rate": 0.000737788872689354, "loss": 4.0104, "step": 3163 }, { "epoch": 0.36, "grad_norm": 0.7875825037735569, "learning_rate": 0.0007376255050458116, "loss": 3.9747, "step": 3164 }, { "epoch": 0.36, "grad_norm": 0.8783661140955957, "learning_rate": 0.0007374621046263603, "loss": 3.7191, "step": 3165 }, { "epoch": 0.36, "grad_norm": 0.7899110100566018, "learning_rate": 0.0007372986714535381, "loss": 4.0875, "step": 3166 }, { "epoch": 0.36, "grad_norm": 1.050651599498055, "learning_rate": 
0.0007371352055498876, "loss": 4.0638, "step": 3167 }, { "epoch": 0.36, "grad_norm": 1.205548968869648, "learning_rate": 0.0007369717069379558, "loss": 3.9114, "step": 3168 }, { "epoch": 0.36, "grad_norm": 3.0209731694913486, "learning_rate": 0.0007368081756402939, "loss": 3.9157, "step": 3169 }, { "epoch": 0.36, "grad_norm": 1.1710717332246532, "learning_rate": 0.0007366446116794583, "loss": 4.0332, "step": 3170 }, { "epoch": 0.36, "grad_norm": 1.1101213779899115, "learning_rate": 0.0007364810150780091, "loss": 4.0872, "step": 3171 }, { "epoch": 0.36, "grad_norm": 0.7501906612137454, "learning_rate": 0.0007363173858585119, "loss": 3.9879, "step": 3172 }, { "epoch": 0.36, "grad_norm": 0.8527474573048001, "learning_rate": 0.000736153724043536, "loss": 3.995, "step": 3173 }, { "epoch": 0.36, "grad_norm": 1.5659669821605355, "learning_rate": 0.000735990029655655, "loss": 4.0501, "step": 3174 }, { "epoch": 0.36, "grad_norm": 0.7466989692730817, "learning_rate": 0.0007358263027174481, "loss": 3.8971, "step": 3175 }, { "epoch": 0.36, "grad_norm": 0.7327415048274, "learning_rate": 0.0007356625432514979, "loss": 3.9427, "step": 3176 }, { "epoch": 0.36, "grad_norm": 1.0650199293385276, "learning_rate": 0.000735498751280392, "loss": 4.1231, "step": 3177 }, { "epoch": 0.36, "grad_norm": 0.7742049244746513, "learning_rate": 0.0007353349268267224, "loss": 3.7813, "step": 3178 }, { "epoch": 0.36, "grad_norm": 1.1548795559868676, "learning_rate": 0.0007351710699130856, "loss": 3.7294, "step": 3179 }, { "epoch": 0.36, "grad_norm": 2.0861410823008986, "learning_rate": 0.0007350071805620823, "loss": 3.68, "step": 3180 }, { "epoch": 0.36, "grad_norm": 0.9654729250798652, "learning_rate": 0.000734843258796318, "loss": 3.9255, "step": 3181 }, { "epoch": 0.36, "grad_norm": 0.9277012308002187, "learning_rate": 0.0007346793046384031, "loss": 3.8071, "step": 3182 }, { "epoch": 0.36, "grad_norm": 1.4484826723712214, "learning_rate": 0.0007345153181109511, "loss": 3.7948, "step": 3183 }, { "epoch": 0.37, "grad_norm": 4.560051725385138, "learning_rate": 0.0007343512992365815, "loss": 3.7773, "step": 3184 }, { "epoch": 0.37, "grad_norm": 0.8223641505884691, "learning_rate": 0.0007341872480379172, "loss": 3.6555, "step": 3185 }, { "epoch": 0.37, "grad_norm": 0.8072325561906724, "learning_rate": 0.0007340231645375861, "loss": 3.8009, "step": 3186 }, { "epoch": 0.37, "grad_norm": 0.831913931302074, "learning_rate": 0.0007338590487582202, "loss": 3.927, "step": 3187 }, { "epoch": 0.37, "grad_norm": 0.7965694378170509, "learning_rate": 0.0007336949007224565, "loss": 3.695, "step": 3188 }, { "epoch": 0.37, "grad_norm": 1.7165092085958433, "learning_rate": 0.0007335307204529356, "loss": 3.9745, "step": 3189 }, { "epoch": 0.37, "grad_norm": 1.0325684291588468, "learning_rate": 0.0007333665079723035, "loss": 3.8982, "step": 3190 }, { "epoch": 0.37, "grad_norm": 1.0851172662650517, "learning_rate": 0.00073320226330321, "loss": 4.084, "step": 3191 }, { "epoch": 0.37, "grad_norm": 1.380479376918727, "learning_rate": 0.0007330379864683096, "loss": 3.8281, "step": 3192 }, { "epoch": 0.37, "grad_norm": 1.008972737476428, "learning_rate": 0.0007328736774902609, "loss": 3.7213, "step": 3193 }, { "epoch": 0.37, "grad_norm": 1.7009085294990036, "learning_rate": 0.0007327093363917274, "loss": 3.8619, "step": 3194 }, { "epoch": 0.37, "grad_norm": 0.8304755075452679, "learning_rate": 0.0007325449631953769, "loss": 3.9651, "step": 3195 }, { "epoch": 0.37, "grad_norm": 1.0055383979848254, "learning_rate": 0.0007323805579238812, "loss": 
3.8116, "step": 3196 }, { "epoch": 0.37, "grad_norm": 1.022872287249141, "learning_rate": 0.0007322161205999173, "loss": 3.9313, "step": 3197 }, { "epoch": 0.37, "grad_norm": 0.9525446506162639, "learning_rate": 0.000732051651246166, "loss": 4.1593, "step": 3198 }, { "epoch": 0.37, "grad_norm": 1.0274440757856111, "learning_rate": 0.0007318871498853126, "loss": 3.847, "step": 3199 }, { "epoch": 0.37, "grad_norm": 1.0967798134160094, "learning_rate": 0.0007317226165400473, "loss": 3.755, "step": 3200 }, { "epoch": 0.37, "grad_norm": 0.8251457283156846, "learning_rate": 0.0007315580512330638, "loss": 4.0466, "step": 3201 }, { "epoch": 0.37, "grad_norm": 2.2870898523675227, "learning_rate": 0.000731393453987061, "loss": 3.9202, "step": 3202 }, { "epoch": 0.37, "grad_norm": 1.0645318184318187, "learning_rate": 0.0007312288248247423, "loss": 3.8487, "step": 3203 }, { "epoch": 0.37, "grad_norm": 1.0003683286940261, "learning_rate": 0.0007310641637688147, "loss": 4.2138, "step": 3204 }, { "epoch": 0.37, "grad_norm": 0.9813651185774678, "learning_rate": 0.0007308994708419901, "loss": 3.7441, "step": 3205 }, { "epoch": 0.37, "grad_norm": 2.8091471370024665, "learning_rate": 0.0007307347460669849, "loss": 3.7869, "step": 3206 }, { "epoch": 0.37, "grad_norm": 0.7493697525877414, "learning_rate": 0.0007305699894665196, "loss": 3.9963, "step": 3207 }, { "epoch": 0.37, "grad_norm": 1.5002080774130453, "learning_rate": 0.0007304052010633195, "loss": 3.8152, "step": 3208 }, { "epoch": 0.37, "grad_norm": 0.8415255338415223, "learning_rate": 0.0007302403808801136, "loss": 3.7804, "step": 3209 }, { "epoch": 0.37, "grad_norm": 1.461208616647697, "learning_rate": 0.0007300755289396362, "loss": 3.9818, "step": 3210 }, { "epoch": 0.37, "grad_norm": 0.8979986831354999, "learning_rate": 0.000729910645264625, "loss": 3.7804, "step": 3211 }, { "epoch": 0.37, "grad_norm": 1.1059875930330487, "learning_rate": 0.000729745729877823, "loss": 3.9757, "step": 3212 }, { "epoch": 0.37, "grad_norm": 1.0536036023156643, "learning_rate": 0.0007295807828019767, "loss": 3.8122, "step": 3213 }, { "epoch": 0.37, "grad_norm": 0.8195665810248793, "learning_rate": 0.0007294158040598378, "loss": 3.6672, "step": 3214 }, { "epoch": 0.37, "grad_norm": 1.8673563126991413, "learning_rate": 0.0007292507936741616, "loss": 3.8718, "step": 3215 }, { "epoch": 0.37, "grad_norm": 1.0043432319579217, "learning_rate": 0.0007290857516677084, "loss": 4.0646, "step": 3216 }, { "epoch": 0.37, "grad_norm": 0.8965114750029936, "learning_rate": 0.0007289206780632427, "loss": 4.1562, "step": 3217 }, { "epoch": 0.37, "grad_norm": 1.0837033557883926, "learning_rate": 0.0007287555728835329, "loss": 3.7986, "step": 3218 }, { "epoch": 0.37, "grad_norm": 0.7487857070587334, "learning_rate": 0.0007285904361513522, "loss": 3.8377, "step": 3219 }, { "epoch": 0.37, "grad_norm": 0.8135987884354283, "learning_rate": 0.0007284252678894785, "loss": 3.7591, "step": 3220 }, { "epoch": 0.37, "grad_norm": 0.9484419316189594, "learning_rate": 0.0007282600681206929, "loss": 3.7643, "step": 3221 }, { "epoch": 0.37, "grad_norm": 1.5757161738269903, "learning_rate": 0.0007280948368677822, "loss": 3.7707, "step": 3222 }, { "epoch": 0.37, "grad_norm": 0.9370845906945936, "learning_rate": 0.0007279295741535367, "loss": 3.8997, "step": 3223 }, { "epoch": 0.37, "grad_norm": 0.9130452904836538, "learning_rate": 0.0007277642800007509, "loss": 3.7848, "step": 3224 }, { "epoch": 0.37, "grad_norm": 1.6735174719309238, "learning_rate": 0.0007275989544322244, "loss": 4.0007, "step": 3225 }, 
{ "epoch": 0.37, "grad_norm": 1.4411842267745922, "learning_rate": 0.0007274335974707606, "loss": 3.7522, "step": 3226 }, { "epoch": 0.37, "grad_norm": 0.8533610231375114, "learning_rate": 0.0007272682091391671, "loss": 3.8233, "step": 3227 }, { "epoch": 0.37, "grad_norm": 0.7429496318241032, "learning_rate": 0.0007271027894602567, "loss": 3.7481, "step": 3228 }, { "epoch": 0.37, "grad_norm": 0.7139382381640661, "learning_rate": 0.0007269373384568451, "loss": 4.0713, "step": 3229 }, { "epoch": 0.37, "grad_norm": 0.8561097399392396, "learning_rate": 0.0007267718561517535, "loss": 3.8586, "step": 3230 }, { "epoch": 0.37, "grad_norm": 1.0325088764166863, "learning_rate": 0.0007266063425678071, "loss": 3.8476, "step": 3231 }, { "epoch": 0.37, "grad_norm": 2.19495329791432, "learning_rate": 0.0007264407977278354, "loss": 3.9132, "step": 3232 }, { "epoch": 0.37, "grad_norm": 0.7550369251214545, "learning_rate": 0.0007262752216546718, "loss": 3.8993, "step": 3233 }, { "epoch": 0.37, "grad_norm": 2.7512586133066153, "learning_rate": 0.0007261096143711545, "loss": 3.8078, "step": 3234 }, { "epoch": 0.37, "grad_norm": 2.068831378453835, "learning_rate": 0.0007259439759001262, "loss": 3.908, "step": 3235 }, { "epoch": 0.37, "grad_norm": 0.7116210940603965, "learning_rate": 0.0007257783062644333, "loss": 3.9668, "step": 3236 }, { "epoch": 0.37, "grad_norm": 0.969637077404665, "learning_rate": 0.0007256126054869265, "loss": 3.857, "step": 3237 }, { "epoch": 0.37, "grad_norm": 1.0896954719338734, "learning_rate": 0.0007254468735904616, "loss": 4.05, "step": 3238 }, { "epoch": 0.37, "grad_norm": 0.9614638172823644, "learning_rate": 0.0007252811105978977, "loss": 3.8354, "step": 3239 }, { "epoch": 0.37, "grad_norm": 1.9251147245838762, "learning_rate": 0.000725115316532099, "loss": 4.0939, "step": 3240 }, { "epoch": 0.37, "grad_norm": 0.733887373119731, "learning_rate": 0.0007249494914159332, "loss": 3.9811, "step": 3241 }, { "epoch": 0.37, "grad_norm": 1.8195089209434936, "learning_rate": 0.0007247836352722733, "loss": 3.9549, "step": 3242 }, { "epoch": 0.37, "grad_norm": 1.8250681120069694, "learning_rate": 0.0007246177481239956, "loss": 3.7505, "step": 3243 }, { "epoch": 0.37, "grad_norm": 0.8119100058017044, "learning_rate": 0.0007244518299939811, "loss": 4.1218, "step": 3244 }, { "epoch": 0.37, "grad_norm": 1.9493760154965782, "learning_rate": 0.0007242858809051152, "loss": 4.0552, "step": 3245 }, { "epoch": 0.37, "grad_norm": 2.139126973383629, "learning_rate": 0.000724119900880287, "loss": 3.7481, "step": 3246 }, { "epoch": 0.37, "grad_norm": 0.6998528107996224, "learning_rate": 0.000723953889942391, "loss": 3.9417, "step": 3247 }, { "epoch": 0.37, "grad_norm": 1.3882857700632552, "learning_rate": 0.0007237878481143246, "loss": 4.0706, "step": 3248 }, { "epoch": 0.37, "grad_norm": 0.9444531147056593, "learning_rate": 0.0007236217754189903, "loss": 4.1516, "step": 3249 }, { "epoch": 0.37, "grad_norm": 1.0081946689119503, "learning_rate": 0.0007234556718792948, "loss": 3.8336, "step": 3250 }, { "epoch": 0.37, "grad_norm": 1.1121353320113967, "learning_rate": 0.0007232895375181488, "loss": 3.8096, "step": 3251 }, { "epoch": 0.37, "grad_norm": 0.9173874108620995, "learning_rate": 0.0007231233723584674, "loss": 3.8035, "step": 3252 }, { "epoch": 0.37, "grad_norm": 0.7488706478535706, "learning_rate": 0.0007229571764231699, "loss": 3.8788, "step": 3253 }, { "epoch": 0.37, "grad_norm": 2.678044885016303, "learning_rate": 0.0007227909497351799, "loss": 4.121, "step": 3254 }, { "epoch": 0.37, "grad_norm": 
1.7314120387189975, "learning_rate": 0.000722624692317425, "loss": 4.142, "step": 3255 }, { "epoch": 0.37, "grad_norm": 0.8091243178081582, "learning_rate": 0.0007224584041928374, "loss": 3.7885, "step": 3256 }, { "epoch": 0.37, "grad_norm": 0.8342122787954406, "learning_rate": 0.0007222920853843538, "loss": 3.9114, "step": 3257 }, { "epoch": 0.37, "grad_norm": 1.211288775875769, "learning_rate": 0.0007221257359149139, "loss": 3.9373, "step": 3258 }, { "epoch": 0.37, "grad_norm": 0.8274927665722583, "learning_rate": 0.0007219593558074629, "loss": 4.0622, "step": 3259 }, { "epoch": 0.37, "grad_norm": 1.1036463315448553, "learning_rate": 0.0007217929450849497, "loss": 3.6351, "step": 3260 }, { "epoch": 0.37, "grad_norm": 0.7221440005906028, "learning_rate": 0.0007216265037703276, "loss": 3.9263, "step": 3261 }, { "epoch": 0.37, "grad_norm": 1.2412576528177284, "learning_rate": 0.0007214600318865538, "loss": 3.6767, "step": 3262 }, { "epoch": 0.37, "grad_norm": 1.1566887871267908, "learning_rate": 0.00072129352945659, "loss": 3.988, "step": 3263 }, { "epoch": 0.37, "grad_norm": 1.0434552630573382, "learning_rate": 0.000721126996503402, "loss": 3.9461, "step": 3264 }, { "epoch": 0.37, "grad_norm": 0.6807683066350915, "learning_rate": 0.0007209604330499599, "loss": 3.9233, "step": 3265 }, { "epoch": 0.37, "grad_norm": 2.613368826945423, "learning_rate": 0.000720793839119238, "loss": 3.7894, "step": 3266 }, { "epoch": 0.37, "grad_norm": 2.092240624604098, "learning_rate": 0.0007206272147342147, "loss": 4.124, "step": 3267 }, { "epoch": 0.37, "grad_norm": 3.1696286092703625, "learning_rate": 0.0007204605599178728, "loss": 3.7238, "step": 3268 }, { "epoch": 0.37, "grad_norm": 0.73991890589454, "learning_rate": 0.0007202938746931988, "loss": 3.8067, "step": 3269 }, { "epoch": 0.37, "grad_norm": 0.8233463829583865, "learning_rate": 0.0007201271590831841, "loss": 3.8695, "step": 3270 }, { "epoch": 0.38, "grad_norm": 0.8315997727189681, "learning_rate": 0.0007199604131108237, "loss": 4.0311, "step": 3271 }, { "epoch": 0.38, "grad_norm": 0.9790552325991776, "learning_rate": 0.0007197936367991174, "loss": 3.813, "step": 3272 }, { "epoch": 0.38, "grad_norm": 0.9058836105684835, "learning_rate": 0.0007196268301710684, "loss": 3.9262, "step": 3273 }, { "epoch": 0.38, "grad_norm": 1.3276799031126767, "learning_rate": 0.0007194599932496845, "loss": 3.7759, "step": 3274 }, { "epoch": 0.38, "grad_norm": 0.6922364365308009, "learning_rate": 0.000719293126057978, "loss": 3.7938, "step": 3275 }, { "epoch": 0.38, "grad_norm": 0.7925328137012758, "learning_rate": 0.0007191262286189649, "loss": 3.7541, "step": 3276 }, { "epoch": 0.38, "grad_norm": 0.7136236412314971, "learning_rate": 0.0007189593009556651, "loss": 3.7641, "step": 3277 }, { "epoch": 0.38, "grad_norm": 1.268285229762576, "learning_rate": 0.0007187923430911039, "loss": 3.7698, "step": 3278 }, { "epoch": 0.38, "grad_norm": 0.7603478063417772, "learning_rate": 0.0007186253550483094, "loss": 3.9921, "step": 3279 }, { "epoch": 0.38, "grad_norm": 0.7909113924028127, "learning_rate": 0.0007184583368503146, "loss": 3.8873, "step": 3280 }, { "epoch": 0.38, "grad_norm": 1.104247812873377, "learning_rate": 0.0007182912885201563, "loss": 3.7127, "step": 3281 }, { "epoch": 0.38, "grad_norm": 1.279528040749498, "learning_rate": 0.0007181242100808759, "loss": 4.0655, "step": 3282 }, { "epoch": 0.38, "grad_norm": 0.8443532847338652, "learning_rate": 0.0007179571015555184, "loss": 3.7618, "step": 3283 }, { "epoch": 0.38, "grad_norm": 0.7895544659371456, 
"learning_rate": 0.0007177899629671335, "loss": 3.9588, "step": 3284 }, { "epoch": 0.38, "grad_norm": 0.7246958847832666, "learning_rate": 0.0007176227943387747, "loss": 3.9795, "step": 3285 }, { "epoch": 0.38, "grad_norm": 0.7559628915040889, "learning_rate": 0.0007174555956934996, "loss": 4.073, "step": 3286 }, { "epoch": 0.38, "grad_norm": 0.9159008771674201, "learning_rate": 0.00071728836705437, "loss": 3.9033, "step": 3287 }, { "epoch": 0.38, "grad_norm": 1.0858859498162203, "learning_rate": 0.0007171211084444525, "loss": 3.7586, "step": 3288 }, { "epoch": 0.38, "grad_norm": 9.645216358139603, "learning_rate": 0.0007169538198868164, "loss": 3.6912, "step": 3289 }, { "epoch": 0.38, "grad_norm": 1.788693198181352, "learning_rate": 0.0007167865014045365, "loss": 3.9072, "step": 3290 }, { "epoch": 0.38, "grad_norm": 1.788693198181352, "learning_rate": 0.0007167865014045365, "loss": 3.8342, "step": 3291 }, { "epoch": 0.38, "grad_norm": 0.9932367216165526, "learning_rate": 0.0007166191530206909, "loss": 3.8412, "step": 3292 }, { "epoch": 0.38, "grad_norm": 0.8956774875540682, "learning_rate": 0.0007164517747583625, "loss": 3.855, "step": 3293 }, { "epoch": 0.38, "grad_norm": 0.6829195804649829, "learning_rate": 0.0007162843666406376, "loss": 3.9458, "step": 3294 }, { "epoch": 0.38, "grad_norm": 1.9170558842659005, "learning_rate": 0.0007161169286906071, "loss": 4.1804, "step": 3295 }, { "epoch": 0.38, "grad_norm": 0.6570486058504044, "learning_rate": 0.0007159494609313659, "loss": 3.844, "step": 3296 }, { "epoch": 0.38, "grad_norm": 1.0473752430252925, "learning_rate": 0.0007157819633860129, "loss": 3.9199, "step": 3297 }, { "epoch": 0.38, "grad_norm": 1.0753472807792175, "learning_rate": 0.0007156144360776514, "loss": 3.9476, "step": 3298 }, { "epoch": 0.38, "grad_norm": 0.9380912241513922, "learning_rate": 0.0007154468790293882, "loss": 3.8748, "step": 3299 }, { "epoch": 0.38, "grad_norm": 1.2010428566544376, "learning_rate": 0.0007152792922643348, "loss": 3.9259, "step": 3300 }, { "epoch": 0.38, "grad_norm": 0.6988279729417768, "learning_rate": 0.0007151116758056066, "loss": 3.9066, "step": 3301 }, { "epoch": 0.38, "grad_norm": 0.8388728990379164, "learning_rate": 0.0007149440296763234, "loss": 4.0507, "step": 3302 }, { "epoch": 0.38, "grad_norm": 0.8019371090198734, "learning_rate": 0.0007147763538996083, "loss": 3.8999, "step": 3303 }, { "epoch": 0.38, "grad_norm": 0.9471624356171503, "learning_rate": 0.0007146086484985892, "loss": 3.9141, "step": 3304 }, { "epoch": 0.38, "grad_norm": 1.1452358131434026, "learning_rate": 0.000714440913496398, "loss": 4.191, "step": 3305 }, { "epoch": 0.38, "grad_norm": 0.9371214143854282, "learning_rate": 0.0007142731489161703, "loss": 3.9109, "step": 3306 }, { "epoch": 0.38, "grad_norm": 0.6616630453887908, "learning_rate": 0.0007141053547810459, "loss": 3.9203, "step": 3307 }, { "epoch": 0.38, "grad_norm": 0.8687846466080862, "learning_rate": 0.0007139375311141693, "loss": 3.8983, "step": 3308 }, { "epoch": 0.38, "grad_norm": 1.9055585315749395, "learning_rate": 0.0007137696779386883, "loss": 3.8551, "step": 3309 }, { "epoch": 0.38, "grad_norm": 0.7579768233824096, "learning_rate": 0.0007136017952777549, "loss": 4.0356, "step": 3310 }, { "epoch": 0.38, "grad_norm": 1.8756032170308423, "learning_rate": 0.0007134338831545257, "loss": 3.9017, "step": 3311 }, { "epoch": 0.38, "grad_norm": 0.8821428690392539, "learning_rate": 0.0007132659415921605, "loss": 3.9252, "step": 3312 }, { "epoch": 0.38, "grad_norm": 1.0928928496973818, "learning_rate": 
0.0007130979706138241, "loss": 4.1134, "step": 3313 }, { "epoch": 0.38, "grad_norm": 0.7697373005142623, "learning_rate": 0.0007129299702426845, "loss": 3.9155, "step": 3314 }, { "epoch": 0.38, "grad_norm": 0.8545652628845798, "learning_rate": 0.0007127619405019143, "loss": 3.9865, "step": 3315 }, { "epoch": 0.38, "grad_norm": 0.7287345355933064, "learning_rate": 0.0007125938814146901, "loss": 4.0573, "step": 3316 }, { "epoch": 0.38, "grad_norm": 0.7125829704126797, "learning_rate": 0.0007124257930041924, "loss": 4.1419, "step": 3317 }, { "epoch": 0.38, "grad_norm": 0.800060692560973, "learning_rate": 0.0007122576752936058, "loss": 3.7792, "step": 3318 }, { "epoch": 0.38, "grad_norm": 1.1431511600957853, "learning_rate": 0.0007120895283061187, "loss": 3.9404, "step": 3319 }, { "epoch": 0.38, "grad_norm": 0.8501775588965105, "learning_rate": 0.000711921352064924, "loss": 3.9759, "step": 3320 }, { "epoch": 0.38, "grad_norm": 0.7237384285426776, "learning_rate": 0.0007117531465932185, "loss": 3.7819, "step": 3321 }, { "epoch": 0.38, "grad_norm": 1.1160297756541127, "learning_rate": 0.0007115849119142026, "loss": 3.7921, "step": 3322 }, { "epoch": 0.38, "grad_norm": 0.8153530887308467, "learning_rate": 0.0007114166480510815, "loss": 4.1012, "step": 3323 }, { "epoch": 0.38, "grad_norm": 0.8529110329044649, "learning_rate": 0.0007112483550270639, "loss": 3.9677, "step": 3324 }, { "epoch": 0.38, "grad_norm": 1.0506613711002666, "learning_rate": 0.000711080032865362, "loss": 4.1987, "step": 3325 }, { "epoch": 0.38, "grad_norm": 0.9112653570946848, "learning_rate": 0.0007109116815891936, "loss": 3.9459, "step": 3326 }, { "epoch": 0.38, "grad_norm": 0.832296854179136, "learning_rate": 0.0007107433012217788, "loss": 4.0432, "step": 3327 }, { "epoch": 0.38, "grad_norm": 0.7344208852668241, "learning_rate": 0.0007105748917863427, "loss": 3.7254, "step": 3328 }, { "epoch": 0.38, "grad_norm": 1.0223196712375513, "learning_rate": 0.0007104064533061144, "loss": 3.9986, "step": 3329 }, { "epoch": 0.38, "grad_norm": 0.6759991402636514, "learning_rate": 0.0007102379858043264, "loss": 3.7802, "step": 3330 }, { "epoch": 0.38, "grad_norm": 0.766145768353875, "learning_rate": 0.0007100694893042159, "loss": 3.7828, "step": 3331 }, { "epoch": 0.38, "grad_norm": 1.0086813540587627, "learning_rate": 0.0007099009638290235, "loss": 4.0193, "step": 3332 }, { "epoch": 0.38, "grad_norm": 0.7778333998551895, "learning_rate": 0.0007097324094019943, "loss": 3.8891, "step": 3333 }, { "epoch": 0.38, "grad_norm": 0.7052453764294455, "learning_rate": 0.000709563826046377, "loss": 4.0414, "step": 3334 }, { "epoch": 0.38, "grad_norm": 1.1315782498395228, "learning_rate": 0.0007093952137854247, "loss": 3.9061, "step": 3335 }, { "epoch": 0.38, "grad_norm": 2.8221283340917593, "learning_rate": 0.0007092265726423941, "loss": 3.9325, "step": 3336 }, { "epoch": 0.38, "grad_norm": 0.8236133825317398, "learning_rate": 0.0007090579026405458, "loss": 3.9009, "step": 3337 }, { "epoch": 0.38, "grad_norm": 0.6795041902059711, "learning_rate": 0.0007088892038031449, "loss": 3.6247, "step": 3338 }, { "epoch": 0.38, "grad_norm": 0.817942846237383, "learning_rate": 0.0007087204761534603, "loss": 3.9833, "step": 3339 }, { "epoch": 0.38, "grad_norm": 0.8733806694411126, "learning_rate": 0.0007085517197147645, "loss": 3.8855, "step": 3340 }, { "epoch": 0.38, "grad_norm": 1.04057477510719, "learning_rate": 0.0007083829345103343, "loss": 4.07, "step": 3341 }, { "epoch": 0.38, "grad_norm": 0.9759100619216993, "learning_rate": 0.0007082141205634505, 
"loss": 3.7606, "step": 3342 }, { "epoch": 0.38, "grad_norm": 1.2224729359711262, "learning_rate": 0.0007080452778973976, "loss": 3.8417, "step": 3343 }, { "epoch": 0.38, "grad_norm": 0.9532101639038549, "learning_rate": 0.0007078764065354643, "loss": 3.7539, "step": 3344 }, { "epoch": 0.38, "grad_norm": 2.0004859769641143, "learning_rate": 0.0007077075065009433, "loss": 3.9559, "step": 3345 }, { "epoch": 0.38, "grad_norm": 1.7698764399583478, "learning_rate": 0.000707538577817131, "loss": 3.9715, "step": 3346 }, { "epoch": 0.38, "grad_norm": 0.7164515040632928, "learning_rate": 0.0007073696205073278, "loss": 3.7503, "step": 3347 }, { "epoch": 0.38, "grad_norm": 0.9834905946086077, "learning_rate": 0.0007072006345948385, "loss": 3.9598, "step": 3348 }, { "epoch": 0.38, "grad_norm": 1.181941722338458, "learning_rate": 0.0007070316201029711, "loss": 3.7085, "step": 3349 }, { "epoch": 0.38, "grad_norm": 0.7297186076588013, "learning_rate": 0.0007068625770550381, "loss": 3.9253, "step": 3350 }, { "epoch": 0.38, "grad_norm": 0.7134133000782996, "learning_rate": 0.0007066935054743559, "loss": 3.9547, "step": 3351 }, { "epoch": 0.38, "grad_norm": 0.7427186559297575, "learning_rate": 0.0007065244053842444, "loss": 3.7245, "step": 3352 }, { "epoch": 0.38, "grad_norm": 0.8390218991412183, "learning_rate": 0.0007063552768080279, "loss": 3.8678, "step": 3353 }, { "epoch": 0.38, "grad_norm": 0.9027806846375394, "learning_rate": 0.0007061861197690347, "loss": 3.7547, "step": 3354 }, { "epoch": 0.38, "grad_norm": 0.7808471782868858, "learning_rate": 0.0007060169342905962, "loss": 3.9217, "step": 3355 }, { "epoch": 0.38, "grad_norm": 1.4682171050198496, "learning_rate": 0.0007058477203960488, "loss": 3.7175, "step": 3356 }, { "epoch": 0.38, "grad_norm": 0.7470543101518838, "learning_rate": 0.0007056784781087322, "loss": 3.9908, "step": 3357 }, { "epoch": 0.39, "grad_norm": 0.7514264045517569, "learning_rate": 0.0007055092074519903, "loss": 3.6495, "step": 3358 }, { "epoch": 0.39, "grad_norm": 0.7643381889110071, "learning_rate": 0.0007053399084491703, "loss": 3.9456, "step": 3359 }, { "epoch": 0.39, "grad_norm": 1.1485988132656653, "learning_rate": 0.0007051705811236241, "loss": 3.7763, "step": 3360 }, { "epoch": 0.39, "grad_norm": 0.6991314395819305, "learning_rate": 0.0007050012254987073, "loss": 3.8523, "step": 3361 }, { "epoch": 0.39, "grad_norm": 0.7390863466630749, "learning_rate": 0.000704831841597779, "loss": 4.0817, "step": 3362 }, { "epoch": 0.39, "grad_norm": 0.9616164635094651, "learning_rate": 0.0007046624294442026, "loss": 3.8319, "step": 3363 }, { "epoch": 0.39, "grad_norm": 0.9278848723190537, "learning_rate": 0.0007044929890613454, "loss": 4.1165, "step": 3364 }, { "epoch": 0.39, "grad_norm": 1.1035003062819806, "learning_rate": 0.0007043235204725783, "loss": 3.9039, "step": 3365 }, { "epoch": 0.39, "grad_norm": 1.0113398390257, "learning_rate": 0.0007041540237012762, "loss": 3.9528, "step": 3366 }, { "epoch": 0.39, "grad_norm": 0.707624380750532, "learning_rate": 0.0007039844987708182, "loss": 3.9872, "step": 3367 }, { "epoch": 0.39, "grad_norm": 0.9039384999302891, "learning_rate": 0.0007038149457045868, "loss": 4.098, "step": 3368 }, { "epoch": 0.39, "grad_norm": 1.2469389927410115, "learning_rate": 0.0007036453645259688, "loss": 3.8255, "step": 3369 }, { "epoch": 0.39, "grad_norm": 1.5144387905060115, "learning_rate": 0.0007034757552583545, "loss": 3.9786, "step": 3370 }, { "epoch": 0.39, "grad_norm": 0.6662324251841486, "learning_rate": 0.0007033061179251385, "loss": 3.9877, "step": 
3371 }, { "epoch": 0.39, "grad_norm": 0.8522015594496838, "learning_rate": 0.0007031364525497187, "loss": 4.0949, "step": 3372 }, { "epoch": 0.39, "grad_norm": 0.9042613339198726, "learning_rate": 0.0007029667591554975, "loss": 3.6223, "step": 3373 }, { "epoch": 0.39, "grad_norm": 0.7799163460096716, "learning_rate": 0.0007027970377658809, "loss": 3.84, "step": 3374 }, { "epoch": 0.39, "grad_norm": 0.715549179249603, "learning_rate": 0.0007026272884042784, "loss": 3.7601, "step": 3375 }, { "epoch": 0.39, "grad_norm": 0.6484329416817454, "learning_rate": 0.0007024575110941041, "loss": 3.9266, "step": 3376 }, { "epoch": 0.39, "grad_norm": 1.457995681702858, "learning_rate": 0.0007022877058587751, "loss": 4.0635, "step": 3377 }, { "epoch": 0.39, "grad_norm": 1.0767536446029515, "learning_rate": 0.0007021178727217131, "loss": 4.0046, "step": 3378 }, { "epoch": 0.39, "grad_norm": 0.9525848377420545, "learning_rate": 0.0007019480117063433, "loss": 3.7873, "step": 3379 }, { "epoch": 0.39, "grad_norm": 0.7259215653212375, "learning_rate": 0.0007017781228360948, "loss": 3.9383, "step": 3380 }, { "epoch": 0.39, "grad_norm": 1.7891362585035462, "learning_rate": 0.0007016082061344005, "loss": 3.8855, "step": 3381 }, { "epoch": 0.39, "grad_norm": 0.9148483620225455, "learning_rate": 0.000701438261624697, "loss": 4.091, "step": 3382 }, { "epoch": 0.39, "grad_norm": 0.7313078105202896, "learning_rate": 0.0007012682893304254, "loss": 4.098, "step": 3383 }, { "epoch": 0.39, "grad_norm": 0.7602219356934325, "learning_rate": 0.0007010982892750296, "loss": 3.6999, "step": 3384 }, { "epoch": 0.39, "grad_norm": 1.2097621623685793, "learning_rate": 0.0007009282614819581, "loss": 3.9788, "step": 3385 }, { "epoch": 0.39, "grad_norm": 2.6795500186652257, "learning_rate": 0.000700758205974663, "loss": 3.9291, "step": 3386 }, { "epoch": 0.39, "grad_norm": 0.9356554033919326, "learning_rate": 0.0007005881227766001, "loss": 3.9079, "step": 3387 }, { "epoch": 0.39, "grad_norm": 1.8288442579933504, "learning_rate": 0.0007004180119112293, "loss": 3.9131, "step": 3388 }, { "epoch": 0.39, "grad_norm": 0.7640551236842704, "learning_rate": 0.0007002478734020141, "loss": 4.001, "step": 3389 }, { "epoch": 0.39, "grad_norm": 0.7254309474148272, "learning_rate": 0.0007000777072724218, "loss": 3.8506, "step": 3390 }, { "epoch": 0.39, "grad_norm": 1.034132164606612, "learning_rate": 0.0006999075135459235, "loss": 3.8911, "step": 3391 }, { "epoch": 0.39, "grad_norm": 0.8745157101684375, "learning_rate": 0.0006997372922459944, "loss": 3.8736, "step": 3392 }, { "epoch": 0.39, "grad_norm": 0.7873742476945783, "learning_rate": 0.0006995670433961132, "loss": 4.162, "step": 3393 }, { "epoch": 0.39, "grad_norm": 0.7638642226764357, "learning_rate": 0.0006993967670197624, "loss": 3.8111, "step": 3394 }, { "epoch": 0.39, "grad_norm": 0.7022687353517552, "learning_rate": 0.0006992264631404284, "loss": 3.7488, "step": 3395 }, { "epoch": 0.39, "grad_norm": 0.7629950217627116, "learning_rate": 0.0006990561317816016, "loss": 3.8693, "step": 3396 }, { "epoch": 0.39, "grad_norm": 2.254579219178182, "learning_rate": 0.0006988857729667754, "loss": 3.9149, "step": 3397 }, { "epoch": 0.39, "grad_norm": 1.1486190289540161, "learning_rate": 0.0006987153867194484, "loss": 3.7692, "step": 3398 }, { "epoch": 0.39, "grad_norm": 0.7182630399968086, "learning_rate": 0.0006985449730631215, "loss": 3.6708, "step": 3399 }, { "epoch": 0.39, "grad_norm": 0.6987208410433455, "learning_rate": 0.0006983745320212998, "loss": 3.8361, "step": 3400 }, { "epoch": 0.39, 
"grad_norm": 0.7273830149274815, "learning_rate": 0.0006982040636174932, "loss": 3.925, "step": 3401 }, { "epoch": 0.39, "grad_norm": 0.7442088239378711, "learning_rate": 0.0006980335678752141, "loss": 3.7854, "step": 3402 }, { "epoch": 0.39, "grad_norm": 1.2861577792209369, "learning_rate": 0.000697863044817979, "loss": 4.0878, "step": 3403 }, { "epoch": 0.39, "grad_norm": 1.1421858691047573, "learning_rate": 0.0006976924944693086, "loss": 4.2397, "step": 3404 }, { "epoch": 0.39, "grad_norm": 1.4935146340052725, "learning_rate": 0.0006975219168527269, "loss": 3.8311, "step": 3405 }, { "epoch": 0.39, "grad_norm": 0.9501004866473197, "learning_rate": 0.000697351311991762, "loss": 3.8277, "step": 3406 }, { "epoch": 0.39, "grad_norm": 0.8083020767559016, "learning_rate": 0.0006971806799099452, "loss": 3.9391, "step": 3407 }, { "epoch": 0.39, "grad_norm": 6.698811712938167, "learning_rate": 0.0006970100206308126, "loss": 3.8411, "step": 3408 }, { "epoch": 0.39, "grad_norm": 2.6763867761628264, "learning_rate": 0.0006968393341779027, "loss": 3.9007, "step": 3409 }, { "epoch": 0.39, "grad_norm": 1.2140361430008328, "learning_rate": 0.0006966686205747588, "loss": 4.1517, "step": 3410 }, { "epoch": 0.39, "grad_norm": 0.6823240883620164, "learning_rate": 0.0006964978798449276, "loss": 3.751, "step": 3411 }, { "epoch": 0.39, "grad_norm": 0.9913836087051249, "learning_rate": 0.0006963271120119594, "loss": 3.9388, "step": 3412 }, { "epoch": 0.39, "grad_norm": 0.6459359711387362, "learning_rate": 0.0006961563170994085, "loss": 4.0994, "step": 3413 }, { "epoch": 0.39, "grad_norm": 1.2890082537160297, "learning_rate": 0.0006959854951308328, "loss": 4.0524, "step": 3414 }, { "epoch": 0.39, "grad_norm": 2.423254101763184, "learning_rate": 0.0006958146461297938, "loss": 3.8892, "step": 3415 }, { "epoch": 0.39, "grad_norm": 0.9413679970961099, "learning_rate": 0.0006956437701198568, "loss": 3.8296, "step": 3416 }, { "epoch": 0.39, "grad_norm": 1.6274652264326335, "learning_rate": 0.0006954728671245911, "loss": 3.918, "step": 3417 }, { "epoch": 0.39, "grad_norm": 0.790044901809993, "learning_rate": 0.0006953019371675695, "loss": 4.0129, "step": 3418 }, { "epoch": 0.39, "grad_norm": 0.9676665528790502, "learning_rate": 0.0006951309802723685, "loss": 4.0326, "step": 3419 }, { "epoch": 0.39, "grad_norm": 1.0504257146461808, "learning_rate": 0.0006949599964625682, "loss": 4.094, "step": 3420 }, { "epoch": 0.39, "grad_norm": 0.84670323952287, "learning_rate": 0.0006947889857617526, "loss": 3.8495, "step": 3421 }, { "epoch": 0.39, "grad_norm": 1.1371950590055073, "learning_rate": 0.0006946179481935095, "loss": 3.8372, "step": 3422 }, { "epoch": 0.39, "grad_norm": 1.3002525669743776, "learning_rate": 0.0006944468837814302, "loss": 3.8715, "step": 3423 }, { "epoch": 0.39, "grad_norm": 0.7729180758493696, "learning_rate": 0.00069427579254911, "loss": 3.9905, "step": 3424 }, { "epoch": 0.39, "grad_norm": 0.7884554919182161, "learning_rate": 0.0006941046745201472, "loss": 3.847, "step": 3425 }, { "epoch": 0.39, "grad_norm": 0.6488700871580563, "learning_rate": 0.0006939335297181447, "loss": 3.8155, "step": 3426 }, { "epoch": 0.39, "grad_norm": 0.7503666562702941, "learning_rate": 0.0006937623581667082, "loss": 3.9625, "step": 3427 }, { "epoch": 0.39, "grad_norm": 1.0876049959583272, "learning_rate": 0.000693591159889448, "loss": 3.8521, "step": 3428 }, { "epoch": 0.39, "grad_norm": 0.8152498626467093, "learning_rate": 0.0006934199349099775, "loss": 3.8991, "step": 3429 }, { "epoch": 0.39, "grad_norm": 
0.6488536361743054, "learning_rate": 0.0006932486832519139, "loss": 3.7437, "step": 3430 }, { "epoch": 0.39, "grad_norm": 1.0128035101805715, "learning_rate": 0.000693077404938878, "loss": 4.0108, "step": 3431 }, { "epoch": 0.39, "grad_norm": 3.0846318157920387, "learning_rate": 0.0006929060999944945, "loss": 3.8975, "step": 3432 }, { "epoch": 0.39, "grad_norm": 0.6921023069389945, "learning_rate": 0.0006927347684423918, "loss": 3.8839, "step": 3433 }, { "epoch": 0.39, "grad_norm": 0.8972886842731401, "learning_rate": 0.0006925634103062015, "loss": 3.8562, "step": 3434 }, { "epoch": 0.39, "grad_norm": 1.4270967331594473, "learning_rate": 0.0006923920256095594, "loss": 4.0701, "step": 3435 }, { "epoch": 0.39, "grad_norm": 1.8406603959717833, "learning_rate": 0.0006922206143761047, "loss": 4.0348, "step": 3436 }, { "epoch": 0.39, "grad_norm": 0.6483785388209308, "learning_rate": 0.0006920491766294803, "loss": 3.9583, "step": 3437 }, { "epoch": 0.39, "grad_norm": 0.8027818586427752, "learning_rate": 0.0006918777123933326, "loss": 3.6985, "step": 3438 }, { "epoch": 0.39, "grad_norm": 0.7881730499784737, "learning_rate": 0.0006917062216913123, "loss": 4.0696, "step": 3439 }, { "epoch": 0.39, "grad_norm": 1.4084361995339105, "learning_rate": 0.0006915347045470728, "loss": 4.0109, "step": 3440 }, { "epoch": 0.39, "grad_norm": 0.8269607185868392, "learning_rate": 0.0006913631609842718, "loss": 3.7854, "step": 3441 }, { "epoch": 0.39, "grad_norm": 1.3941232174974072, "learning_rate": 0.0006911915910265703, "loss": 4.0964, "step": 3442 }, { "epoch": 0.39, "grad_norm": 0.7766374229092557, "learning_rate": 0.0006910199946976337, "loss": 3.8728, "step": 3443 }, { "epoch": 0.39, "grad_norm": 0.7633863529134473, "learning_rate": 0.0006908483720211296, "loss": 4.1946, "step": 3444 }, { "epoch": 0.4, "grad_norm": 1.6290155697136393, "learning_rate": 0.0006906767230207306, "loss": 4.136, "step": 3445 }, { "epoch": 0.4, "grad_norm": 1.2704090995992519, "learning_rate": 0.0006905050477201124, "loss": 4.0953, "step": 3446 }, { "epoch": 0.4, "grad_norm": 1.2654324771121526, "learning_rate": 0.0006903333461429539, "loss": 3.798, "step": 3447 }, { "epoch": 0.4, "grad_norm": 0.9909990827320652, "learning_rate": 0.0006901616183129386, "loss": 4.1468, "step": 3448 }, { "epoch": 0.4, "grad_norm": 0.749643808058182, "learning_rate": 0.0006899898642537531, "loss": 3.882, "step": 3449 }, { "epoch": 0.4, "grad_norm": 0.7403367971173949, "learning_rate": 0.000689818083989087, "loss": 3.9016, "step": 3450 }, { "epoch": 0.4, "grad_norm": 0.8388187632599542, "learning_rate": 0.0006896462775426346, "loss": 3.9837, "step": 3451 }, { "epoch": 0.4, "grad_norm": 0.7823853264195315, "learning_rate": 0.0006894744449380932, "loss": 3.9834, "step": 3452 }, { "epoch": 0.4, "grad_norm": 0.6603462704373079, "learning_rate": 0.0006893025861991639, "loss": 4.0045, "step": 3453 }, { "epoch": 0.4, "grad_norm": 1.0499252507313506, "learning_rate": 0.0006891307013495513, "loss": 3.9849, "step": 3454 }, { "epoch": 0.4, "grad_norm": 0.8533183408450671, "learning_rate": 0.0006889587904129634, "loss": 3.9691, "step": 3455 }, { "epoch": 0.4, "grad_norm": 0.7484177997356751, "learning_rate": 0.0006887868534131124, "loss": 3.8457, "step": 3456 }, { "epoch": 0.4, "grad_norm": 0.7316877089455902, "learning_rate": 0.0006886148903737135, "loss": 3.7713, "step": 3457 }, { "epoch": 0.4, "grad_norm": 1.0522430678634722, "learning_rate": 0.0006884429013184858, "loss": 3.8651, "step": 3458 }, { "epoch": 0.4, "grad_norm": 0.842635263200612, "learning_rate": 
0.000688270886271152, "loss": 3.9971, "step": 3459 }, { "epoch": 0.4, "grad_norm": 0.6470647164589675, "learning_rate": 0.0006880988452554382, "loss": 3.9114, "step": 3460 }, { "epoch": 0.4, "grad_norm": 1.5554346801592385, "learning_rate": 0.0006879267782950742, "loss": 3.9231, "step": 3461 }, { "epoch": 0.4, "grad_norm": 0.701080607529182, "learning_rate": 0.0006877546854137933, "loss": 3.9852, "step": 3462 }, { "epoch": 0.4, "grad_norm": 0.8180322478135571, "learning_rate": 0.0006875825666353324, "loss": 3.699, "step": 3463 }, { "epoch": 0.4, "grad_norm": 1.0545858703366515, "learning_rate": 0.0006874104219834322, "loss": 4.0429, "step": 3464 }, { "epoch": 0.4, "grad_norm": 0.7400692625762085, "learning_rate": 0.0006872382514818365, "loss": 3.9042, "step": 3465 }, { "epoch": 0.4, "grad_norm": 1.1285625795042549, "learning_rate": 0.0006870660551542932, "loss": 3.9184, "step": 3466 }, { "epoch": 0.4, "grad_norm": 0.8076321339938585, "learning_rate": 0.0006868938330245534, "loss": 3.8364, "step": 3467 }, { "epoch": 0.4, "grad_norm": 1.0387469566415193, "learning_rate": 0.000686721585116372, "loss": 4.115, "step": 3468 }, { "epoch": 0.4, "grad_norm": 0.7695249201747901, "learning_rate": 0.0006865493114535068, "loss": 3.9014, "step": 3469 }, { "epoch": 0.4, "grad_norm": 1.0867002076568821, "learning_rate": 0.0006863770120597204, "loss": 3.8513, "step": 3470 }, { "epoch": 0.4, "grad_norm": 0.7378111455454898, "learning_rate": 0.0006862046869587777, "loss": 3.7228, "step": 3471 }, { "epoch": 0.4, "grad_norm": 1.0666297914218206, "learning_rate": 0.0006860323361744477, "loss": 3.9952, "step": 3472 }, { "epoch": 0.4, "grad_norm": 1.1008295327368602, "learning_rate": 0.0006858599597305033, "loss": 3.9428, "step": 3473 }, { "epoch": 0.4, "grad_norm": 1.2934369093790543, "learning_rate": 0.0006856875576507201, "loss": 3.9347, "step": 3474 }, { "epoch": 0.4, "grad_norm": 0.7241660390474685, "learning_rate": 0.0006855151299588778, "loss": 4.0554, "step": 3475 }, { "epoch": 0.4, "grad_norm": 1.6380365928392744, "learning_rate": 0.0006853426766787597, "loss": 3.6894, "step": 3476 }, { "epoch": 0.4, "grad_norm": 0.9102016312376878, "learning_rate": 0.0006851701978341522, "loss": 3.9171, "step": 3477 }, { "epoch": 0.4, "grad_norm": 0.9029902352372559, "learning_rate": 0.0006849976934488456, "loss": 3.7216, "step": 3478 }, { "epoch": 0.4, "grad_norm": 0.7921749643492633, "learning_rate": 0.0006848251635466336, "loss": 4.0616, "step": 3479 }, { "epoch": 0.4, "grad_norm": 1.0574798562989824, "learning_rate": 0.0006846526081513134, "loss": 3.879, "step": 3480 }, { "epoch": 0.4, "grad_norm": 1.011119152752429, "learning_rate": 0.0006844800272866856, "loss": 3.9388, "step": 3481 }, { "epoch": 0.4, "grad_norm": 0.9921451405798993, "learning_rate": 0.0006843074209765545, "loss": 3.9541, "step": 3482 }, { "epoch": 0.4, "grad_norm": 0.9820282798971539, "learning_rate": 0.0006841347892447281, "loss": 4.085, "step": 3483 }, { "epoch": 0.4, "grad_norm": 0.6332435155135141, "learning_rate": 0.0006839621321150174, "loss": 3.8381, "step": 3484 }, { "epoch": 0.4, "grad_norm": 0.7204132105337496, "learning_rate": 0.0006837894496112371, "loss": 3.8958, "step": 3485 }, { "epoch": 0.4, "grad_norm": 0.7463569060463991, "learning_rate": 0.0006836167417572056, "loss": 4.0131, "step": 3486 }, { "epoch": 0.4, "grad_norm": 0.7812333462119307, "learning_rate": 0.0006834440085767447, "loss": 4.0659, "step": 3487 }, { "epoch": 0.4, "grad_norm": 1.0239075048241486, "learning_rate": 0.0006832712500936795, "loss": 3.9459, "step": 3488 
}, { "epoch": 0.4, "grad_norm": 0.843402038885923, "learning_rate": 0.0006830984663318391, "loss": 3.8898, "step": 3489 }, { "epoch": 0.4, "grad_norm": 0.8033088065077526, "learning_rate": 0.0006829256573150551, "loss": 3.8685, "step": 3490 }, { "epoch": 0.4, "grad_norm": 0.7152013760506205, "learning_rate": 0.0006827528230671636, "loss": 4.1323, "step": 3491 }, { "epoch": 0.4, "grad_norm": 0.7290283489152473, "learning_rate": 0.0006825799636120038, "loss": 3.7115, "step": 3492 }, { "epoch": 0.4, "grad_norm": 0.6891665300299957, "learning_rate": 0.0006824070789734184, "loss": 3.8302, "step": 3493 }, { "epoch": 0.4, "grad_norm": 0.6718413449229536, "learning_rate": 0.0006822341691752532, "loss": 3.6873, "step": 3494 }, { "epoch": 0.4, "grad_norm": 0.9035927182864095, "learning_rate": 0.0006820612342413583, "loss": 4.0226, "step": 3495 }, { "epoch": 0.4, "grad_norm": 0.6774448424190441, "learning_rate": 0.0006818882741955866, "loss": 3.8371, "step": 3496 }, { "epoch": 0.4, "grad_norm": 0.9980535651122074, "learning_rate": 0.0006817152890617943, "loss": 3.9907, "step": 3497 }, { "epoch": 0.4, "grad_norm": 1.3390681167080065, "learning_rate": 0.0006815422788638418, "loss": 4.0254, "step": 3498 }, { "epoch": 0.4, "grad_norm": 1.0711592852563288, "learning_rate": 0.0006813692436255926, "loss": 3.9534, "step": 3499 }, { "epoch": 0.4, "grad_norm": 0.797292642291846, "learning_rate": 0.0006811961833709132, "loss": 3.898, "step": 3500 }, { "epoch": 0.4, "grad_norm": 0.9858966831795786, "learning_rate": 0.0006810230981236743, "loss": 3.9352, "step": 3501 }, { "epoch": 0.4, "grad_norm": 0.8387059106614584, "learning_rate": 0.0006808499879077496, "loss": 4.1777, "step": 3502 }, { "epoch": 0.4, "grad_norm": 0.7609113558647651, "learning_rate": 0.0006806768527470165, "loss": 3.9475, "step": 3503 }, { "epoch": 0.4, "grad_norm": 0.9282326349249027, "learning_rate": 0.0006805036926653556, "loss": 4.0016, "step": 3504 }, { "epoch": 0.4, "grad_norm": 0.9815411734506811, "learning_rate": 0.0006803305076866509, "loss": 4.0452, "step": 3505 }, { "epoch": 0.4, "grad_norm": 0.9940812785229797, "learning_rate": 0.0006801572978347901, "loss": 4.1654, "step": 3506 }, { "epoch": 0.4, "grad_norm": 0.9731790585273556, "learning_rate": 0.0006799840631336642, "loss": 3.9056, "step": 3507 }, { "epoch": 0.4, "grad_norm": 0.8039046280725042, "learning_rate": 0.0006798108036071677, "loss": 4.0764, "step": 3508 }, { "epoch": 0.4, "grad_norm": 0.8202666346749958, "learning_rate": 0.0006796375192791982, "loss": 3.9441, "step": 3509 }, { "epoch": 0.4, "grad_norm": 0.6887662478470483, "learning_rate": 0.000679464210173657, "loss": 3.8861, "step": 3510 }, { "epoch": 0.4, "grad_norm": 1.0082304270421656, "learning_rate": 0.0006792908763144492, "loss": 3.8607, "step": 3511 }, { "epoch": 0.4, "grad_norm": 1.0473323836020187, "learning_rate": 0.0006791175177254825, "loss": 3.936, "step": 3512 }, { "epoch": 0.4, "grad_norm": 1.1986392196012854, "learning_rate": 0.0006789441344306684, "loss": 3.8672, "step": 3513 }, { "epoch": 0.4, "grad_norm": 0.8703538059875083, "learning_rate": 0.0006787707264539222, "loss": 3.7305, "step": 3514 }, { "epoch": 0.4, "grad_norm": 0.8870040300060339, "learning_rate": 0.0006785972938191617, "loss": 3.8769, "step": 3515 }, { "epoch": 0.4, "grad_norm": 1.112597726577681, "learning_rate": 0.0006784238365503089, "loss": 4.1307, "step": 3516 }, { "epoch": 0.4, "grad_norm": 0.7980253485856914, "learning_rate": 0.000678250354671289, "loss": 3.9224, "step": 3517 }, { "epoch": 0.4, "grad_norm": 0.8033843637443447, 
"learning_rate": 0.0006780768482060305, "loss": 3.9783, "step": 3518 }, { "epoch": 0.4, "grad_norm": 1.1140867404432595, "learning_rate": 0.0006779033171784652, "loss": 3.8248, "step": 3519 }, { "epoch": 0.4, "grad_norm": 0.6853604869731147, "learning_rate": 0.0006777297616125283, "loss": 3.7346, "step": 3520 }, { "epoch": 0.4, "grad_norm": 0.9058647993068147, "learning_rate": 0.0006775561815321589, "loss": 3.9134, "step": 3521 }, { "epoch": 0.4, "grad_norm": 0.7237692247364785, "learning_rate": 0.0006773825769612984, "loss": 3.9275, "step": 3522 }, { "epoch": 0.4, "grad_norm": 4.232269882999302, "learning_rate": 0.000677208947923893, "loss": 3.9909, "step": 3523 }, { "epoch": 0.4, "grad_norm": 0.7659789556729887, "learning_rate": 0.0006770352944438911, "loss": 4.1014, "step": 3524 }, { "epoch": 0.4, "grad_norm": 0.8642823439955702, "learning_rate": 0.0006768616165452447, "loss": 4.0442, "step": 3525 }, { "epoch": 0.4, "grad_norm": 1.7540440130219759, "learning_rate": 0.0006766879142519098, "loss": 4.0786, "step": 3526 }, { "epoch": 0.4, "grad_norm": 0.838257519465221, "learning_rate": 0.0006765141875878449, "loss": 3.8114, "step": 3527 }, { "epoch": 0.4, "grad_norm": 0.694682695398746, "learning_rate": 0.0006763404365770126, "loss": 3.7961, "step": 3528 }, { "epoch": 0.4, "grad_norm": 0.8843409812350062, "learning_rate": 0.0006761666612433786, "loss": 3.8591, "step": 3529 }, { "epoch": 0.4, "grad_norm": 0.8760827883300009, "learning_rate": 0.0006759928616109115, "loss": 3.9443, "step": 3530 }, { "epoch": 0.4, "grad_norm": 0.718162751463088, "learning_rate": 0.0006758190377035839, "loss": 3.7694, "step": 3531 }, { "epoch": 0.4, "grad_norm": 0.7542454611567001, "learning_rate": 0.0006756451895453715, "loss": 3.9392, "step": 3532 }, { "epoch": 0.41, "grad_norm": 0.9337734254787918, "learning_rate": 0.0006754713171602533, "loss": 3.7732, "step": 3533 }, { "epoch": 0.41, "grad_norm": 2.0898045365480518, "learning_rate": 0.0006752974205722117, "loss": 3.9051, "step": 3534 }, { "epoch": 0.41, "grad_norm": 1.0839891480315167, "learning_rate": 0.0006751234998052324, "loss": 4.0171, "step": 3535 }, { "epoch": 0.41, "grad_norm": 1.0839891480315167, "learning_rate": 0.0006751234998052324, "loss": 4.1755, "step": 3536 }, { "epoch": 0.41, "grad_norm": 0.9350549118256333, "learning_rate": 0.0006749495548833044, "loss": 3.6416, "step": 3537 }, { "epoch": 0.41, "grad_norm": 0.7765250517020164, "learning_rate": 0.0006747755858304203, "loss": 4.1252, "step": 3538 }, { "epoch": 0.41, "grad_norm": 0.8141208461592765, "learning_rate": 0.0006746015926705755, "loss": 3.858, "step": 3539 }, { "epoch": 0.41, "grad_norm": 0.9447318215713497, "learning_rate": 0.0006744275754277694, "loss": 3.9369, "step": 3540 }, { "epoch": 0.41, "grad_norm": 0.8102530724347842, "learning_rate": 0.0006742535341260038, "loss": 3.7406, "step": 3541 }, { "epoch": 0.41, "grad_norm": 0.8694864074254017, "learning_rate": 0.000674079468789285, "loss": 3.7441, "step": 3542 }, { "epoch": 0.41, "grad_norm": 1.3971699507829856, "learning_rate": 0.0006739053794416217, "loss": 3.8904, "step": 3543 }, { "epoch": 0.41, "grad_norm": 1.0143829728609426, "learning_rate": 0.0006737312661070263, "loss": 3.885, "step": 3544 }, { "epoch": 0.41, "grad_norm": 0.7966521077029844, "learning_rate": 0.0006735571288095144, "loss": 3.9936, "step": 3545 }, { "epoch": 0.41, "grad_norm": 0.8230559112466994, "learning_rate": 0.0006733829675731047, "loss": 3.7552, "step": 3546 }, { "epoch": 0.41, "grad_norm": 1.136183085205189, "learning_rate": 0.0006732087824218197, 
"loss": 3.8612, "step": 3547 }, { "epoch": 0.41, "grad_norm": 0.7633842509423904, "learning_rate": 0.0006730345733796847, "loss": 4.0291, "step": 3548 }, { "epoch": 0.41, "grad_norm": 2.7268972032723986, "learning_rate": 0.0006728603404707288, "loss": 3.9903, "step": 3549 }, { "epoch": 0.41, "grad_norm": 0.7746432780134294, "learning_rate": 0.0006726860837189839, "loss": 3.8263, "step": 3550 }, { "epoch": 0.41, "grad_norm": 0.9064380536348211, "learning_rate": 0.0006725118031484855, "loss": 3.9239, "step": 3551 }, { "epoch": 0.41, "grad_norm": 0.9286131729673138, "learning_rate": 0.0006723374987832722, "loss": 4.08, "step": 3552 }, { "epoch": 0.41, "grad_norm": 0.9697802558604689, "learning_rate": 0.000672163170647386, "loss": 4.1046, "step": 3553 }, { "epoch": 0.41, "grad_norm": 0.7003760425040427, "learning_rate": 0.0006719888187648721, "loss": 4.0419, "step": 3554 }, { "epoch": 0.41, "grad_norm": 1.2204156697810131, "learning_rate": 0.000671814443159779, "loss": 3.8157, "step": 3555 }, { "epoch": 0.41, "grad_norm": 0.8381101904679068, "learning_rate": 0.0006716400438561588, "loss": 4.3142, "step": 3556 }, { "epoch": 0.41, "grad_norm": 0.6231159222001623, "learning_rate": 0.000671465620878066, "loss": 4.1682, "step": 3557 }, { "epoch": 0.41, "grad_norm": 1.3752803975763683, "learning_rate": 0.0006712911742495593, "loss": 3.9779, "step": 3558 }, { "epoch": 0.41, "grad_norm": 1.5978996178080047, "learning_rate": 0.0006711167039947003, "loss": 3.9663, "step": 3559 }, { "epoch": 0.41, "grad_norm": 0.7122112028732169, "learning_rate": 0.0006709422101375537, "loss": 3.8424, "step": 3560 }, { "epoch": 0.41, "grad_norm": 0.8669749331298523, "learning_rate": 0.0006707676927021878, "loss": 4.0485, "step": 3561 }, { "epoch": 0.41, "grad_norm": 1.1669885183262103, "learning_rate": 0.0006705931517126738, "loss": 3.9763, "step": 3562 }, { "epoch": 0.41, "grad_norm": 0.6383938184887691, "learning_rate": 0.0006704185871930861, "loss": 3.8064, "step": 3563 }, { "epoch": 0.41, "grad_norm": 1.028108814476399, "learning_rate": 0.0006702439991675029, "loss": 3.7581, "step": 3564 }, { "epoch": 0.41, "grad_norm": 1.3011606877518336, "learning_rate": 0.0006700693876600052, "loss": 3.9909, "step": 3565 }, { "epoch": 0.41, "grad_norm": 0.8505085161711463, "learning_rate": 0.0006698947526946774, "loss": 3.9405, "step": 3566 }, { "epoch": 0.41, "grad_norm": 0.7080398353770477, "learning_rate": 0.0006697200942956068, "loss": 3.9288, "step": 3567 }, { "epoch": 0.41, "grad_norm": 0.8416727970758157, "learning_rate": 0.0006695454124868843, "loss": 3.8673, "step": 3568 }, { "epoch": 0.41, "grad_norm": 1.2223297553858314, "learning_rate": 0.0006693707072926041, "loss": 3.8075, "step": 3569 }, { "epoch": 0.41, "grad_norm": 0.8492243729109681, "learning_rate": 0.0006691959787368633, "loss": 3.8987, "step": 3570 }, { "epoch": 0.41, "grad_norm": 0.783652309163139, "learning_rate": 0.0006690212268437623, "loss": 3.8685, "step": 3571 }, { "epoch": 0.41, "grad_norm": 0.9325910289767202, "learning_rate": 0.000668846451637405, "loss": 3.8827, "step": 3572 }, { "epoch": 0.41, "grad_norm": 0.6080810619402465, "learning_rate": 0.0006686716531418981, "loss": 4.0698, "step": 3573 }, { "epoch": 0.41, "grad_norm": 1.109614608536991, "learning_rate": 0.000668496831381352, "loss": 4.2891, "step": 3574 }, { "epoch": 0.41, "grad_norm": 0.6380339786590021, "learning_rate": 0.0006683219863798797, "loss": 3.9442, "step": 3575 }, { "epoch": 0.41, "grad_norm": 2.7687087047645305, "learning_rate": 0.0006681471181615979, "loss": 3.7061, "step": 
3576 }, { "epoch": 0.41, "grad_norm": 0.9707893781845968, "learning_rate": 0.0006679722267506265, "loss": 4.0632, "step": 3577 }, { "epoch": 0.41, "grad_norm": 0.8345750130667939, "learning_rate": 0.0006677973121710881, "loss": 3.857, "step": 3578 }, { "epoch": 0.41, "grad_norm": 0.6335529342032116, "learning_rate": 0.000667622374447109, "loss": 3.6911, "step": 3579 }, { "epoch": 0.41, "grad_norm": 0.7696052062229788, "learning_rate": 0.0006674474136028186, "loss": 3.9166, "step": 3580 }, { "epoch": 0.41, "grad_norm": 0.7424280516365559, "learning_rate": 0.0006672724296623492, "loss": 4.0122, "step": 3581 }, { "epoch": 0.41, "grad_norm": 0.9350963608893395, "learning_rate": 0.0006670974226498367, "loss": 3.7741, "step": 3582 }, { "epoch": 0.41, "grad_norm": 0.9588264447606708, "learning_rate": 0.0006669223925894199, "loss": 3.8034, "step": 3583 }, { "epoch": 0.41, "grad_norm": 0.7215632029193716, "learning_rate": 0.0006667473395052411, "loss": 3.9574, "step": 3584 }, { "epoch": 0.41, "grad_norm": 1.0455888026373705, "learning_rate": 0.000666572263421445, "loss": 4.034, "step": 3585 }, { "epoch": 0.41, "grad_norm": 0.7462321018172617, "learning_rate": 0.0006663971643621803, "loss": 4.0364, "step": 3586 }, { "epoch": 0.41, "grad_norm": 0.9918695724811575, "learning_rate": 0.0006662220423515987, "loss": 4.1729, "step": 3587 }, { "epoch": 0.41, "grad_norm": 0.7352466717369355, "learning_rate": 0.0006660468974138549, "loss": 3.9855, "step": 3588 }, { "epoch": 0.41, "grad_norm": 0.8119060401011864, "learning_rate": 0.0006658717295731067, "loss": 3.9365, "step": 3589 }, { "epoch": 0.41, "grad_norm": 0.7658410446020455, "learning_rate": 0.0006656965388535153, "loss": 3.7967, "step": 3590 }, { "epoch": 0.41, "grad_norm": 0.7075660031890818, "learning_rate": 0.0006655213252792446, "loss": 3.9751, "step": 3591 }, { "epoch": 0.41, "grad_norm": 0.8365927269682267, "learning_rate": 0.0006653460888744625, "loss": 3.8675, "step": 3592 }, { "epoch": 0.41, "grad_norm": 0.7729212861937325, "learning_rate": 0.000665170829663339, "loss": 3.8043, "step": 3593 }, { "epoch": 0.41, "grad_norm": 0.8846142590745467, "learning_rate": 0.000664995547670048, "loss": 3.7027, "step": 3594 }, { "epoch": 0.41, "grad_norm": 0.7374995723051665, "learning_rate": 0.0006648202429187664, "loss": 3.7515, "step": 3595 }, { "epoch": 0.41, "grad_norm": 0.8122231952790627, "learning_rate": 0.0006646449154336739, "loss": 3.7328, "step": 3596 }, { "epoch": 0.41, "grad_norm": 1.135153788434209, "learning_rate": 0.0006644695652389538, "loss": 4.0128, "step": 3597 }, { "epoch": 0.41, "grad_norm": 0.9605791259258104, "learning_rate": 0.0006642941923587923, "loss": 4.2211, "step": 3598 }, { "epoch": 0.41, "grad_norm": 0.9710386438884726, "learning_rate": 0.0006641187968173788, "loss": 3.8975, "step": 3599 }, { "epoch": 0.41, "grad_norm": 0.7499156331744677, "learning_rate": 0.0006639433786389054, "loss": 4.0966, "step": 3600 }, { "epoch": 0.41, "grad_norm": 1.2481071855138506, "learning_rate": 0.0006637679378475681, "loss": 3.992, "step": 3601 }, { "epoch": 0.41, "grad_norm": 1.0767641619613455, "learning_rate": 0.0006635924744675656, "loss": 3.9972, "step": 3602 }, { "epoch": 0.41, "grad_norm": 1.0117238249467249, "learning_rate": 0.0006634169885230994, "loss": 4.0576, "step": 3603 }, { "epoch": 0.41, "grad_norm": 0.9633423103886396, "learning_rate": 0.0006632414800383747, "loss": 3.6439, "step": 3604 }, { "epoch": 0.41, "grad_norm": 3.198673850933346, "learning_rate": 0.0006630659490375996, "loss": 3.898, "step": 3605 }, { "epoch": 0.41, 
"grad_norm": 0.7653104015991626, "learning_rate": 0.000662890395544985, "loss": 3.9705, "step": 3606 }, { "epoch": 0.41, "grad_norm": 1.1643562543901387, "learning_rate": 0.0006627148195847455, "loss": 4.0835, "step": 3607 }, { "epoch": 0.41, "grad_norm": 1.5839340084987905, "learning_rate": 0.000662539221181098, "loss": 3.9641, "step": 3608 }, { "epoch": 0.41, "grad_norm": 1.322600261285496, "learning_rate": 0.0006623636003582633, "loss": 4.1775, "step": 3609 }, { "epoch": 0.41, "grad_norm": 0.6986306076599722, "learning_rate": 0.0006621879571404649, "loss": 4.0461, "step": 3610 }, { "epoch": 0.41, "grad_norm": 0.8959403714705687, "learning_rate": 0.0006620122915519295, "loss": 4.2408, "step": 3611 }, { "epoch": 0.41, "grad_norm": 0.7422888938299663, "learning_rate": 0.0006618366036168867, "loss": 3.8729, "step": 3612 }, { "epoch": 0.41, "grad_norm": 0.7073567605154498, "learning_rate": 0.0006616608933595692, "loss": 3.9383, "step": 3613 }, { "epoch": 0.41, "grad_norm": 1.5767185829389347, "learning_rate": 0.0006614851608042131, "loss": 3.9365, "step": 3614 }, { "epoch": 0.41, "grad_norm": 0.9174775197815938, "learning_rate": 0.0006613094059750573, "loss": 3.7849, "step": 3615 }, { "epoch": 0.41, "grad_norm": 0.6752714413137972, "learning_rate": 0.0006611336288963436, "loss": 4.2065, "step": 3616 }, { "epoch": 0.41, "grad_norm": 3.349616899475959, "learning_rate": 0.0006609578295923174, "loss": 3.8994, "step": 3617 }, { "epoch": 0.41, "grad_norm": 0.7404579665417675, "learning_rate": 0.0006607820080872267, "loss": 3.9186, "step": 3618 }, { "epoch": 0.41, "grad_norm": 1.3375083931445144, "learning_rate": 0.0006606061644053227, "loss": 3.9066, "step": 3619 }, { "epoch": 0.42, "grad_norm": 1.1318201113656365, "learning_rate": 0.0006604302985708599, "loss": 3.7508, "step": 3620 }, { "epoch": 0.42, "grad_norm": 0.6458300055831522, "learning_rate": 0.0006602544106080955, "loss": 3.8776, "step": 3621 }, { "epoch": 0.42, "grad_norm": 0.7503899937692639, "learning_rate": 0.0006600785005412897, "loss": 3.8035, "step": 3622 }, { "epoch": 0.42, "grad_norm": 0.8237359573929451, "learning_rate": 0.0006599025683947062, "loss": 4.09, "step": 3623 }, { "epoch": 0.42, "grad_norm": 0.8176670601464088, "learning_rate": 0.0006597266141926115, "loss": 3.8048, "step": 3624 }, { "epoch": 0.42, "grad_norm": 0.9037075555997728, "learning_rate": 0.000659550637959275, "loss": 3.9426, "step": 3625 }, { "epoch": 0.42, "grad_norm": 0.9273429257949872, "learning_rate": 0.0006593746397189692, "loss": 3.8772, "step": 3626 }, { "epoch": 0.42, "grad_norm": 1.115855861943939, "learning_rate": 0.00065919861949597, "loss": 4.0285, "step": 3627 }, { "epoch": 0.42, "grad_norm": 0.7831998453127073, "learning_rate": 0.0006590225773145556, "loss": 4.0581, "step": 3628 }, { "epoch": 0.42, "grad_norm": 1.0154735427366068, "learning_rate": 0.000658846513199008, "loss": 3.8329, "step": 3629 }, { "epoch": 0.42, "grad_norm": 1.8200122780353563, "learning_rate": 0.0006586704271736119, "loss": 4.0292, "step": 3630 }, { "epoch": 0.42, "grad_norm": 0.9500465193257284, "learning_rate": 0.0006584943192626549, "loss": 3.9099, "step": 3631 }, { "epoch": 0.42, "grad_norm": 1.738394319450492, "learning_rate": 0.0006583181894904277, "loss": 3.9645, "step": 3632 }, { "epoch": 0.42, "grad_norm": 0.7457587357691314, "learning_rate": 0.000658142037881224, "loss": 3.8962, "step": 3633 }, { "epoch": 0.42, "grad_norm": 1.0183735978500776, "learning_rate": 0.0006579658644593407, "loss": 3.6121, "step": 3634 }, { "epoch": 0.42, "grad_norm": 
2.598891179243258, "learning_rate": 0.0006577896692490775, "loss": 3.9492, "step": 3635 }, { "epoch": 0.42, "grad_norm": 0.9377435359257685, "learning_rate": 0.0006576134522747371, "loss": 4.0991, "step": 3636 }, { "epoch": 0.42, "grad_norm": 0.941476084885805, "learning_rate": 0.0006574372135606254, "loss": 3.7155, "step": 3637 }, { "epoch": 0.42, "grad_norm": 2.2129226137651785, "learning_rate": 0.0006572609531310511, "loss": 3.9932, "step": 3638 }, { "epoch": 0.42, "grad_norm": 0.7427693538391277, "learning_rate": 0.0006570846710103261, "loss": 3.9024, "step": 3639 }, { "epoch": 0.42, "grad_norm": 0.7166249094543058, "learning_rate": 0.0006569083672227649, "loss": 4.1948, "step": 3640 }, { "epoch": 0.42, "grad_norm": 0.7901155942162832, "learning_rate": 0.0006567320417926855, "loss": 3.9909, "step": 3641 }, { "epoch": 0.42, "grad_norm": 1.2330577538755232, "learning_rate": 0.0006565556947444085, "loss": 3.6988, "step": 3642 }, { "epoch": 0.42, "grad_norm": 1.8945454855793098, "learning_rate": 0.0006563793261022575, "loss": 4.2009, "step": 3643 }, { "epoch": 0.42, "grad_norm": 0.8049202705369144, "learning_rate": 0.0006562029358905593, "loss": 3.8642, "step": 3644 }, { "epoch": 0.42, "grad_norm": 0.9387421585216944, "learning_rate": 0.0006560265241336438, "loss": 3.8996, "step": 3645 }, { "epoch": 0.42, "grad_norm": 0.6984098183498278, "learning_rate": 0.0006558500908558434, "loss": 3.7463, "step": 3646 }, { "epoch": 0.42, "grad_norm": 0.7260655543750778, "learning_rate": 0.0006556736360814935, "loss": 3.9088, "step": 3647 }, { "epoch": 0.42, "grad_norm": 0.7695389419310944, "learning_rate": 0.000655497159834933, "loss": 3.7829, "step": 3648 }, { "epoch": 0.42, "grad_norm": 1.2991407045787722, "learning_rate": 0.0006553206621405037, "loss": 3.9126, "step": 3649 }, { "epoch": 0.42, "grad_norm": 0.6648531241479184, "learning_rate": 0.0006551441430225493, "loss": 3.936, "step": 3650 }, { "epoch": 0.42, "grad_norm": 0.7575269344239155, "learning_rate": 0.0006549676025054179, "loss": 4.0846, "step": 3651 }, { "epoch": 0.42, "grad_norm": 1.40428366571099, "learning_rate": 0.0006547910406134597, "loss": 4.0177, "step": 3652 }, { "epoch": 0.42, "grad_norm": 1.0422919899771146, "learning_rate": 0.000654614457371028, "loss": 3.9038, "step": 3653 }, { "epoch": 0.42, "grad_norm": 0.7659668945542222, "learning_rate": 0.000654437852802479, "loss": 3.9248, "step": 3654 }, { "epoch": 0.42, "grad_norm": 0.8503749190737011, "learning_rate": 0.0006542612269321722, "loss": 3.9363, "step": 3655 }, { "epoch": 0.42, "grad_norm": 1.9285882414545406, "learning_rate": 0.0006540845797844697, "loss": 4.0174, "step": 3656 }, { "epoch": 0.42, "grad_norm": 0.640956243397319, "learning_rate": 0.0006539079113837363, "loss": 3.7436, "step": 3657 }, { "epoch": 0.42, "grad_norm": 0.6712320486249768, "learning_rate": 0.0006537312217543404, "loss": 4.0883, "step": 3658 }, { "epoch": 0.42, "grad_norm": 0.6562870931953991, "learning_rate": 0.0006535545109206529, "loss": 3.6647, "step": 3659 }, { "epoch": 0.42, "grad_norm": 0.7337082723010244, "learning_rate": 0.0006533777789070475, "loss": 4.0601, "step": 3660 }, { "epoch": 0.42, "grad_norm": 0.6823843766648049, "learning_rate": 0.0006532010257379013, "loss": 4.1901, "step": 3661 }, { "epoch": 0.42, "grad_norm": 2.3567102604083834, "learning_rate": 0.0006530242514375938, "loss": 3.8741, "step": 3662 }, { "epoch": 0.42, "grad_norm": 1.531462696557364, "learning_rate": 0.0006528474560305076, "loss": 4.0853, "step": 3663 }, { "epoch": 0.42, "grad_norm": 0.7677654658001719, 
"learning_rate": 0.0006526706395410287, "loss": 3.6884, "step": 3664 }, { "epoch": 0.42, "grad_norm": 0.7222067408228138, "learning_rate": 0.000652493801993545, "loss": 3.834, "step": 3665 }, { "epoch": 0.42, "grad_norm": 0.8595382816261831, "learning_rate": 0.0006523169434124481, "loss": 4.0389, "step": 3666 }, { "epoch": 0.42, "grad_norm": 0.7844396687038182, "learning_rate": 0.0006521400638221324, "loss": 3.9972, "step": 3667 }, { "epoch": 0.42, "grad_norm": 0.6846347103949043, "learning_rate": 0.0006519631632469949, "loss": 3.5917, "step": 3668 }, { "epoch": 0.42, "grad_norm": 0.8717577305108276, "learning_rate": 0.0006517862417114356, "loss": 4.1146, "step": 3669 }, { "epoch": 0.42, "grad_norm": 0.6595895242736812, "learning_rate": 0.0006516092992398578, "loss": 3.8069, "step": 3670 }, { "epoch": 0.42, "grad_norm": 0.8086637976060025, "learning_rate": 0.000651432335856667, "loss": 3.8648, "step": 3671 }, { "epoch": 0.42, "grad_norm": 0.8844947274079333, "learning_rate": 0.0006512553515862718, "loss": 3.8609, "step": 3672 }, { "epoch": 0.42, "grad_norm": 0.6114866380458802, "learning_rate": 0.0006510783464530842, "loss": 3.7297, "step": 3673 }, { "epoch": 0.42, "grad_norm": 0.7619852283362972, "learning_rate": 0.0006509013204815188, "loss": 3.9021, "step": 3674 }, { "epoch": 0.42, "grad_norm": 0.632980241310844, "learning_rate": 0.0006507242736959923, "loss": 3.7546, "step": 3675 }, { "epoch": 0.42, "grad_norm": 0.7952368049339943, "learning_rate": 0.0006505472061209255, "loss": 3.9878, "step": 3676 }, { "epoch": 0.42, "grad_norm": 1.6286627041637103, "learning_rate": 0.0006503701177807413, "loss": 3.7858, "step": 3677 }, { "epoch": 0.42, "grad_norm": 0.746941231248338, "learning_rate": 0.0006501930086998656, "loss": 3.9441, "step": 3678 }, { "epoch": 0.42, "grad_norm": 0.7670845478209449, "learning_rate": 0.0006500158789027273, "loss": 3.8741, "step": 3679 }, { "epoch": 0.42, "grad_norm": 0.7634332746176549, "learning_rate": 0.0006498387284137582, "loss": 3.9943, "step": 3680 }, { "epoch": 0.42, "grad_norm": 0.8529586348059764, "learning_rate": 0.0006496615572573925, "loss": 3.6691, "step": 3681 }, { "epoch": 0.42, "grad_norm": 0.7260094474850411, "learning_rate": 0.0006494843654580678, "loss": 3.9286, "step": 3682 }, { "epoch": 0.42, "grad_norm": 0.8296010718600338, "learning_rate": 0.0006493071530402244, "loss": 3.9613, "step": 3683 }, { "epoch": 0.42, "grad_norm": 1.261348159658844, "learning_rate": 0.0006491299200283054, "loss": 4.0749, "step": 3684 }, { "epoch": 0.42, "grad_norm": 0.7468608260620027, "learning_rate": 0.0006489526664467565, "loss": 3.8891, "step": 3685 }, { "epoch": 0.42, "grad_norm": 0.819720362809581, "learning_rate": 0.0006487753923200264, "loss": 3.7673, "step": 3686 }, { "epoch": 0.42, "grad_norm": 3.613444781879377, "learning_rate": 0.0006485980976725671, "loss": 3.7678, "step": 3687 }, { "epoch": 0.42, "grad_norm": 0.7380531807017446, "learning_rate": 0.0006484207825288326, "loss": 3.7881, "step": 3688 }, { "epoch": 0.42, "grad_norm": 0.6475858802660085, "learning_rate": 0.0006482434469132803, "loss": 3.6697, "step": 3689 }, { "epoch": 0.42, "grad_norm": 4.048467168263876, "learning_rate": 0.0006480660908503704, "loss": 3.7103, "step": 3690 }, { "epoch": 0.42, "grad_norm": 0.7014295452916026, "learning_rate": 0.0006478887143645655, "loss": 4.0226, "step": 3691 }, { "epoch": 0.42, "grad_norm": 0.741809768005646, "learning_rate": 0.0006477113174803317, "loss": 3.816, "step": 3692 }, { "epoch": 0.42, "grad_norm": 0.6615304776421617, "learning_rate": 
0.0006475339002221371, "loss": 3.7205, "step": 3693 }, { "epoch": 0.42, "grad_norm": 0.945354155044178, "learning_rate": 0.0006473564626144532, "loss": 3.9089, "step": 3694 }, { "epoch": 0.42, "grad_norm": 0.9658463791837414, "learning_rate": 0.0006471790046817542, "loss": 3.7785, "step": 3695 }, { "epoch": 0.42, "grad_norm": 0.9449788669096689, "learning_rate": 0.0006470015264485172, "loss": 3.7954, "step": 3696 }, { "epoch": 0.42, "grad_norm": 0.6775048438438243, "learning_rate": 0.0006468240279392214, "loss": 3.9567, "step": 3697 }, { "epoch": 0.42, "grad_norm": 0.7697037014662359, "learning_rate": 0.0006466465091783497, "loss": 3.8592, "step": 3698 }, { "epoch": 0.42, "grad_norm": 0.7115771917537173, "learning_rate": 0.0006464689701903876, "loss": 3.9671, "step": 3699 }, { "epoch": 0.42, "grad_norm": 1.1276198651134772, "learning_rate": 0.000646291410999823, "loss": 3.8976, "step": 3700 }, { "epoch": 0.42, "grad_norm": 0.7536288254825133, "learning_rate": 0.0006461138316311467, "loss": 4.0387, "step": 3701 }, { "epoch": 0.42, "grad_norm": 1.050197137787701, "learning_rate": 0.0006459362321088527, "loss": 3.8736, "step": 3702 }, { "epoch": 0.42, "grad_norm": 0.8700220023279057, "learning_rate": 0.000645758612457437, "loss": 3.3618, "step": 3703 }, { "epoch": 0.42, "grad_norm": 1.0681145020954401, "learning_rate": 0.0006455809727013992, "loss": 3.8805, "step": 3704 }, { "epoch": 0.42, "grad_norm": 0.6171896523842016, "learning_rate": 0.0006454033128652414, "loss": 3.9272, "step": 3705 }, { "epoch": 0.42, "grad_norm": 0.6184764056962017, "learning_rate": 0.0006452256329734682, "loss": 3.9003, "step": 3706 }, { "epoch": 0.43, "grad_norm": 0.9385573416245611, "learning_rate": 0.0006450479330505869, "loss": 3.9125, "step": 3707 }, { "epoch": 0.43, "grad_norm": 0.7563145905460674, "learning_rate": 0.0006448702131211084, "loss": 4.0483, "step": 3708 }, { "epoch": 0.43, "grad_norm": 1.1687823804136304, "learning_rate": 0.0006446924732095455, "loss": 3.6856, "step": 3709 }, { "epoch": 0.43, "grad_norm": 0.698660175811671, "learning_rate": 0.0006445147133404139, "loss": 3.9726, "step": 3710 }, { "epoch": 0.43, "grad_norm": 2.6312829353790628, "learning_rate": 0.0006443369335382322, "loss": 3.843, "step": 3711 }, { "epoch": 0.43, "grad_norm": 0.9852225324274464, "learning_rate": 0.000644159133827522, "loss": 3.7733, "step": 3712 }, { "epoch": 0.43, "grad_norm": 0.8437478778667405, "learning_rate": 0.000643981314232807, "loss": 4.0346, "step": 3713 }, { "epoch": 0.43, "grad_norm": 0.9475562089146817, "learning_rate": 0.0006438034747786144, "loss": 3.9736, "step": 3714 }, { "epoch": 0.43, "grad_norm": 0.8636962588237793, "learning_rate": 0.0006436256154894737, "loss": 3.9592, "step": 3715 }, { "epoch": 0.43, "grad_norm": 0.7318645508722592, "learning_rate": 0.0006434477363899168, "loss": 3.5851, "step": 3716 }, { "epoch": 0.43, "grad_norm": 0.8944596866954657, "learning_rate": 0.0006432698375044793, "loss": 3.6354, "step": 3717 }, { "epoch": 0.43, "grad_norm": 0.7418584414303938, "learning_rate": 0.0006430919188576986, "loss": 4.0396, "step": 3718 }, { "epoch": 0.43, "grad_norm": 0.6754508156196104, "learning_rate": 0.0006429139804741151, "loss": 3.9345, "step": 3719 }, { "epoch": 0.43, "grad_norm": 0.7262016383370631, "learning_rate": 0.0006427360223782726, "loss": 3.7747, "step": 3720 }, { "epoch": 0.43, "grad_norm": 1.085011955327319, "learning_rate": 0.0006425580445947163, "loss": 3.7996, "step": 3721 }, { "epoch": 0.43, "grad_norm": 0.6958015526240903, "learning_rate": 0.000642380047147995, 
"loss": 3.9303, "step": 3722 }, { "epoch": 0.43, "grad_norm": 0.7756311469538366, "learning_rate": 0.0006422020300626604, "loss": 3.9284, "step": 3723 }, { "epoch": 0.43, "grad_norm": 0.7304329490179511, "learning_rate": 0.0006420239933632666, "loss": 3.8593, "step": 3724 }, { "epoch": 0.43, "grad_norm": 0.7731441059496545, "learning_rate": 0.0006418459370743698, "loss": 4.1534, "step": 3725 }, { "epoch": 0.43, "grad_norm": 0.7057355384832473, "learning_rate": 0.0006416678612205298, "loss": 4.2048, "step": 3726 }, { "epoch": 0.43, "grad_norm": 0.8596051252797567, "learning_rate": 0.000641489765826309, "loss": 3.7626, "step": 3727 }, { "epoch": 0.43, "grad_norm": 0.7071178947738033, "learning_rate": 0.0006413116509162719, "loss": 4.096, "step": 3728 }, { "epoch": 0.43, "grad_norm": 0.846406056639936, "learning_rate": 0.0006411335165149863, "loss": 3.9744, "step": 3729 }, { "epoch": 0.43, "grad_norm": 0.7956650646404932, "learning_rate": 0.0006409553626470223, "loss": 4.018, "step": 3730 }, { "epoch": 0.43, "grad_norm": 1.370395578034691, "learning_rate": 0.0006407771893369529, "loss": 3.8198, "step": 3731 }, { "epoch": 0.43, "grad_norm": 0.7291617503274787, "learning_rate": 0.0006405989966093536, "loss": 4.1521, "step": 3732 }, { "epoch": 0.43, "grad_norm": 1.1926649514143501, "learning_rate": 0.0006404207844888029, "loss": 3.7767, "step": 3733 }, { "epoch": 0.43, "grad_norm": 0.7000784759258993, "learning_rate": 0.0006402425529998816, "loss": 3.8244, "step": 3734 }, { "epoch": 0.43, "grad_norm": 0.8825095919773164, "learning_rate": 0.0006400643021671734, "loss": 3.8621, "step": 3735 }, { "epoch": 0.43, "grad_norm": 0.7972369948027153, "learning_rate": 0.0006398860320152645, "loss": 3.9272, "step": 3736 }, { "epoch": 0.43, "grad_norm": 0.7064902937564493, "learning_rate": 0.0006397077425687441, "loss": 3.8965, "step": 3737 }, { "epoch": 0.43, "grad_norm": 1.0918181228874975, "learning_rate": 0.0006395294338522034, "loss": 3.894, "step": 3738 }, { "epoch": 0.43, "grad_norm": 0.6824664291979713, "learning_rate": 0.0006393511058902373, "loss": 3.874, "step": 3739 }, { "epoch": 0.43, "grad_norm": 0.6898578015312339, "learning_rate": 0.0006391727587074423, "loss": 3.6875, "step": 3740 }, { "epoch": 0.43, "grad_norm": 0.8317088221810423, "learning_rate": 0.000638994392328418, "loss": 3.8562, "step": 3741 }, { "epoch": 0.43, "grad_norm": 1.0846790795137597, "learning_rate": 0.0006388160067777669, "loss": 4.103, "step": 3742 }, { "epoch": 0.43, "grad_norm": 0.7052006208972414, "learning_rate": 0.0006386376020800936, "loss": 3.8531, "step": 3743 }, { "epoch": 0.43, "grad_norm": 0.6950664342453873, "learning_rate": 0.0006384591782600058, "loss": 3.6479, "step": 3744 }, { "epoch": 0.43, "grad_norm": 1.084692729831116, "learning_rate": 0.0006382807353421138, "loss": 3.7663, "step": 3745 }, { "epoch": 0.43, "grad_norm": 1.7983085097542468, "learning_rate": 0.0006381022733510299, "loss": 3.8366, "step": 3746 }, { "epoch": 0.43, "grad_norm": 1.3946214392354745, "learning_rate": 0.0006379237923113701, "loss": 3.7164, "step": 3747 }, { "epoch": 0.43, "grad_norm": 0.6749154996343165, "learning_rate": 0.0006377452922477523, "loss": 3.9446, "step": 3748 }, { "epoch": 0.43, "grad_norm": 0.8204211831389304, "learning_rate": 0.0006375667731847969, "loss": 3.6819, "step": 3749 }, { "epoch": 0.43, "grad_norm": 0.7142581119238758, "learning_rate": 0.0006373882351471275, "loss": 3.9218, "step": 3750 }, { "epoch": 0.43, "grad_norm": 0.7775280742306231, "learning_rate": 0.0006372096781593699, "loss": 3.9553, "step": 
3751 }, { "epoch": 0.43, "grad_norm": 0.7904166404014514, "learning_rate": 0.0006370311022461528, "loss": 3.9241, "step": 3752 }, { "epoch": 0.43, "grad_norm": 1.8894756437351268, "learning_rate": 0.000636852507432107, "loss": 4.2848, "step": 3753 }, { "epoch": 0.43, "grad_norm": 0.803363789617827, "learning_rate": 0.0006366738937418664, "loss": 3.7317, "step": 3754 }, { "epoch": 0.43, "grad_norm": 0.7746049738677555, "learning_rate": 0.0006364952612000676, "loss": 3.8695, "step": 3755 }, { "epoch": 0.43, "grad_norm": 0.635098670605469, "learning_rate": 0.0006363166098313492, "loss": 3.9366, "step": 3756 }, { "epoch": 0.43, "grad_norm": 4.185261732914754, "learning_rate": 0.0006361379396603529, "loss": 3.8868, "step": 3757 }, { "epoch": 0.43, "grad_norm": 2.053941184938164, "learning_rate": 0.0006359592507117229, "loss": 3.8142, "step": 3758 }, { "epoch": 0.43, "grad_norm": 0.7419551093934218, "learning_rate": 0.000635780543010106, "loss": 3.7208, "step": 3759 }, { "epoch": 0.43, "grad_norm": 2.3655220298096507, "learning_rate": 0.0006356018165801511, "loss": 4.0424, "step": 3760 }, { "epoch": 0.43, "grad_norm": 1.1956640378076362, "learning_rate": 0.0006354230714465107, "loss": 4.1285, "step": 3761 }, { "epoch": 0.43, "grad_norm": 0.7505211358734369, "learning_rate": 0.000635244307633839, "loss": 3.8976, "step": 3762 }, { "epoch": 0.43, "grad_norm": 0.6834290494996135, "learning_rate": 0.0006350655251667927, "loss": 3.9982, "step": 3763 }, { "epoch": 0.43, "grad_norm": 0.9894235402428392, "learning_rate": 0.0006348867240700321, "loss": 3.7627, "step": 3764 }, { "epoch": 0.43, "grad_norm": 1.0128268139648133, "learning_rate": 0.0006347079043682191, "loss": 3.9649, "step": 3765 }, { "epoch": 0.43, "grad_norm": 1.2611444967926502, "learning_rate": 0.0006345290660860184, "loss": 3.9048, "step": 3766 }, { "epoch": 0.43, "grad_norm": 0.703234359930271, "learning_rate": 0.0006343502092480973, "loss": 3.7623, "step": 3767 }, { "epoch": 0.43, "grad_norm": 0.6439519316790526, "learning_rate": 0.0006341713338791258, "loss": 3.7922, "step": 3768 }, { "epoch": 0.43, "grad_norm": 0.7375251828304318, "learning_rate": 0.0006339924400037765, "loss": 3.9528, "step": 3769 }, { "epoch": 0.43, "grad_norm": 1.1907004242770522, "learning_rate": 0.0006338135276467241, "loss": 3.7563, "step": 3770 }, { "epoch": 0.43, "grad_norm": 0.8517710600016476, "learning_rate": 0.0006336345968326462, "loss": 4.0626, "step": 3771 }, { "epoch": 0.43, "grad_norm": 0.8255803332077114, "learning_rate": 0.0006334556475862231, "loss": 3.8626, "step": 3772 }, { "epoch": 0.43, "grad_norm": 0.6874264956249384, "learning_rate": 0.0006332766799321372, "loss": 3.8582, "step": 3773 }, { "epoch": 0.43, "grad_norm": 1.4051199225362898, "learning_rate": 0.000633097693895074, "loss": 4.158, "step": 3774 }, { "epoch": 0.43, "grad_norm": 0.6309842559261709, "learning_rate": 0.0006329186894997208, "loss": 3.8197, "step": 3775 }, { "epoch": 0.43, "grad_norm": 0.9933881605020008, "learning_rate": 0.0006327396667707682, "loss": 4.0231, "step": 3776 }, { "epoch": 0.43, "grad_norm": 0.7494055877320432, "learning_rate": 0.0006325606257329086, "loss": 3.836, "step": 3777 }, { "epoch": 0.43, "grad_norm": 0.9225578497354067, "learning_rate": 0.0006323815664108376, "loss": 3.8303, "step": 3778 }, { "epoch": 0.43, "grad_norm": 0.725856523931713, "learning_rate": 0.000632202488829253, "loss": 4.0584, "step": 3779 }, { "epoch": 0.43, "grad_norm": 0.6801632682200957, "learning_rate": 0.0006320233930128551, "loss": 3.9711, "step": 3780 }, { "epoch": 0.43, 
"grad_norm": 1.975049527889184, "learning_rate": 0.0006318442789863466, "loss": 3.6658, "step": 3781 }, { "epoch": 0.43, "grad_norm": 0.6988468696080055, "learning_rate": 0.0006316651467744332, "loss": 4.0369, "step": 3782 }, { "epoch": 0.43, "grad_norm": 0.6455387305722126, "learning_rate": 0.0006314859964018224, "loss": 3.9406, "step": 3783 }, { "epoch": 0.43, "grad_norm": 1.3362657685600379, "learning_rate": 0.0006313068278932248, "loss": 3.9254, "step": 3784 }, { "epoch": 0.43, "grad_norm": 1.0406712983260975, "learning_rate": 0.0006311276412733532, "loss": 3.9088, "step": 3785 }, { "epoch": 0.43, "grad_norm": 0.8624483423504795, "learning_rate": 0.000630948436566923, "loss": 4.0839, "step": 3786 }, { "epoch": 0.43, "grad_norm": 1.203492173793301, "learning_rate": 0.0006307692137986522, "loss": 4.095, "step": 3787 }, { "epoch": 0.43, "grad_norm": 0.8557631766646576, "learning_rate": 0.0006305899729932608, "loss": 3.9382, "step": 3788 }, { "epoch": 0.43, "grad_norm": 1.034005549179806, "learning_rate": 0.0006304107141754721, "loss": 3.7617, "step": 3789 }, { "epoch": 0.43, "grad_norm": 0.7000834086847256, "learning_rate": 0.0006302314373700113, "loss": 3.8549, "step": 3790 }, { "epoch": 0.43, "grad_norm": 0.6576694450734466, "learning_rate": 0.000630052142601606, "loss": 3.7212, "step": 3791 }, { "epoch": 0.43, "grad_norm": 1.337894921109633, "learning_rate": 0.0006298728298949866, "loss": 4.0012, "step": 3792 }, { "epoch": 0.43, "grad_norm": 0.780509051578872, "learning_rate": 0.0006296934992748859, "loss": 4.219, "step": 3793 }, { "epoch": 0.44, "grad_norm": 0.7069317810191248, "learning_rate": 0.0006295141507660394, "loss": 4.0125, "step": 3794 }, { "epoch": 0.44, "grad_norm": 0.9012368199496167, "learning_rate": 0.0006293347843931844, "loss": 4.0751, "step": 3795 }, { "epoch": 0.44, "grad_norm": 1.4274593084796288, "learning_rate": 0.0006291554001810612, "loss": 4.0125, "step": 3796 }, { "epoch": 0.44, "grad_norm": 0.9713563953228154, "learning_rate": 0.0006289759981544126, "loss": 4.0642, "step": 3797 }, { "epoch": 0.44, "grad_norm": 0.7912659801252071, "learning_rate": 0.0006287965783379834, "loss": 3.7896, "step": 3798 }, { "epoch": 0.44, "grad_norm": 1.3236594349836928, "learning_rate": 0.0006286171407565214, "loss": 3.989, "step": 3799 }, { "epoch": 0.44, "grad_norm": 0.7159162593771167, "learning_rate": 0.0006284376854347766, "loss": 4.052, "step": 3800 }, { "epoch": 0.44, "grad_norm": 0.6303280976942193, "learning_rate": 0.0006282582123975011, "loss": 3.7241, "step": 3801 }, { "epoch": 0.44, "grad_norm": 0.9052411874613575, "learning_rate": 0.0006280787216694502, "loss": 3.9503, "step": 3802 }, { "epoch": 0.44, "grad_norm": 0.764718076058984, "learning_rate": 0.000627899213275381, "loss": 3.8855, "step": 3803 }, { "epoch": 0.44, "grad_norm": 0.656213815623388, "learning_rate": 0.0006277196872400534, "loss": 4.0296, "step": 3804 }, { "epoch": 0.44, "grad_norm": 1.625823254437847, "learning_rate": 0.0006275401435882294, "loss": 4.1079, "step": 3805 }, { "epoch": 0.44, "grad_norm": 0.8964607149406679, "learning_rate": 0.0006273605823446738, "loss": 3.8305, "step": 3806 }, { "epoch": 0.44, "grad_norm": 1.3291432040284095, "learning_rate": 0.0006271810035341534, "loss": 3.9852, "step": 3807 }, { "epoch": 0.44, "grad_norm": 0.757861965610753, "learning_rate": 0.000627001407181438, "loss": 3.8936, "step": 3808 }, { "epoch": 0.44, "grad_norm": 0.7529189442058805, "learning_rate": 0.0006268217933112994, "loss": 3.9598, "step": 3809 }, { "epoch": 0.44, "grad_norm": 0.8693786935937036, 
"learning_rate": 0.0006266421619485116, "loss": 4.0094, "step": 3810 }, { "epoch": 0.44, "grad_norm": 0.997463111911183, "learning_rate": 0.0006264625131178519, "loss": 3.8569, "step": 3811 }, { "epoch": 0.44, "grad_norm": 0.8706933113218259, "learning_rate": 0.000626282846844099, "loss": 3.8064, "step": 3812 }, { "epoch": 0.44, "grad_norm": 1.26405329903931, "learning_rate": 0.0006261031631520345, "loss": 3.839, "step": 3813 }, { "epoch": 0.44, "grad_norm": 0.7038556353585114, "learning_rate": 0.0006259234620664423, "loss": 3.8967, "step": 3814 }, { "epoch": 0.44, "grad_norm": 1.0497885641254059, "learning_rate": 0.0006257437436121091, "loss": 3.9833, "step": 3815 }, { "epoch": 0.44, "grad_norm": 0.948149810567662, "learning_rate": 0.0006255640078138229, "loss": 3.9623, "step": 3816 }, { "epoch": 0.44, "grad_norm": 1.0727729948648244, "learning_rate": 0.0006253842546963757, "loss": 3.8611, "step": 3817 }, { "epoch": 0.44, "grad_norm": 0.7004766938447416, "learning_rate": 0.0006252044842845606, "loss": 3.9186, "step": 3818 }, { "epoch": 0.44, "grad_norm": 0.8106607549597292, "learning_rate": 0.0006250246966031733, "loss": 4.0344, "step": 3819 }, { "epoch": 0.44, "grad_norm": 1.0977659207426198, "learning_rate": 0.0006248448916770124, "loss": 3.8798, "step": 3820 }, { "epoch": 0.44, "grad_norm": 7.6286003430436775, "learning_rate": 0.0006246650695308784, "loss": 3.9351, "step": 3821 }, { "epoch": 0.44, "grad_norm": 0.7462135001013377, "learning_rate": 0.0006244852301895745, "loss": 3.8828, "step": 3822 }, { "epoch": 0.44, "grad_norm": 0.7923451810724231, "learning_rate": 0.0006243053736779058, "loss": 3.8462, "step": 3823 }, { "epoch": 0.44, "grad_norm": 1.1281722713053233, "learning_rate": 0.0006241255000206806, "loss": 4.0794, "step": 3824 }, { "epoch": 0.44, "grad_norm": 1.0302631069074202, "learning_rate": 0.0006239456092427085, "loss": 3.8729, "step": 3825 }, { "epoch": 0.44, "grad_norm": 0.7843121879844208, "learning_rate": 0.0006237657013688022, "loss": 4.081, "step": 3826 }, { "epoch": 0.44, "grad_norm": 1.21057190127897, "learning_rate": 0.0006235857764237767, "loss": 4.0095, "step": 3827 }, { "epoch": 0.44, "grad_norm": 1.187828130976851, "learning_rate": 0.0006234058344324491, "loss": 3.7299, "step": 3828 }, { "epoch": 0.44, "grad_norm": 0.7066165066899703, "learning_rate": 0.0006232258754196388, "loss": 3.9171, "step": 3829 }, { "epoch": 0.44, "grad_norm": 0.8401418523748041, "learning_rate": 0.0006230458994101681, "loss": 3.9376, "step": 3830 }, { "epoch": 0.44, "grad_norm": 0.7475419255688679, "learning_rate": 0.0006228659064288609, "loss": 4.0371, "step": 3831 }, { "epoch": 0.44, "grad_norm": 0.7868335833500767, "learning_rate": 0.0006226858965005439, "loss": 3.9867, "step": 3832 }, { "epoch": 0.44, "grad_norm": 0.8567653081142867, "learning_rate": 0.0006225058696500462, "loss": 3.9825, "step": 3833 }, { "epoch": 0.44, "grad_norm": 0.652228465549666, "learning_rate": 0.000622325825902199, "loss": 3.6515, "step": 3834 }, { "epoch": 0.44, "grad_norm": 0.8844679466319197, "learning_rate": 0.0006221457652818357, "loss": 3.8354, "step": 3835 }, { "epoch": 0.44, "grad_norm": 0.6973173487253017, "learning_rate": 0.0006219656878137925, "loss": 3.9639, "step": 3836 }, { "epoch": 0.44, "grad_norm": 0.7823791606433925, "learning_rate": 0.0006217855935229075, "loss": 3.9177, "step": 3837 }, { "epoch": 0.44, "grad_norm": 0.8111451039804923, "learning_rate": 0.0006216054824340212, "loss": 4.0611, "step": 3838 }, { "epoch": 0.44, "grad_norm": 0.6866882162949008, "learning_rate": 
0.0006214253545719768, "loss": 4.0965, "step": 3839 }, { "epoch": 0.44, "grad_norm": 0.9645942858723421, "learning_rate": 0.0006212452099616194, "loss": 3.8414, "step": 3840 }, { "epoch": 0.44, "grad_norm": 1.6830129164079284, "learning_rate": 0.0006210650486277961, "loss": 3.9337, "step": 3841 }, { "epoch": 0.44, "grad_norm": 0.6889218885560208, "learning_rate": 0.0006208848705953573, "loss": 3.7494, "step": 3842 }, { "epoch": 0.44, "grad_norm": 0.6900392225947852, "learning_rate": 0.0006207046758891548, "loss": 4.088, "step": 3843 }, { "epoch": 0.44, "grad_norm": 0.677004454022921, "learning_rate": 0.0006205244645340431, "loss": 4.0048, "step": 3844 }, { "epoch": 0.44, "grad_norm": 1.2283148999718763, "learning_rate": 0.0006203442365548791, "loss": 3.9655, "step": 3845 }, { "epoch": 0.44, "grad_norm": 0.7354157420234919, "learning_rate": 0.0006201639919765214, "loss": 3.9625, "step": 3846 }, { "epoch": 0.44, "grad_norm": 0.9333223637695195, "learning_rate": 0.0006199837308238315, "loss": 4.0, "step": 3847 }, { "epoch": 0.44, "grad_norm": 0.6596517220011557, "learning_rate": 0.0006198034531216731, "loss": 3.9253, "step": 3848 }, { "epoch": 0.44, "grad_norm": 0.7270658384584764, "learning_rate": 0.0006196231588949121, "loss": 3.7699, "step": 3849 }, { "epoch": 0.44, "grad_norm": 0.8969469672690581, "learning_rate": 0.0006194428481684166, "loss": 4.1532, "step": 3850 }, { "epoch": 0.44, "grad_norm": 0.7671450336615364, "learning_rate": 0.0006192625209670568, "loss": 4.1748, "step": 3851 }, { "epoch": 0.44, "grad_norm": 0.9183685224952616, "learning_rate": 0.0006190821773157058, "loss": 3.7049, "step": 3852 }, { "epoch": 0.44, "grad_norm": 0.6950220384098033, "learning_rate": 0.0006189018172392382, "loss": 3.7653, "step": 3853 }, { "epoch": 0.44, "grad_norm": 1.4642742251007173, "learning_rate": 0.0006187214407625313, "loss": 3.8236, "step": 3854 }, { "epoch": 0.44, "grad_norm": 1.1025943609692397, "learning_rate": 0.000618541047910465, "loss": 4.1097, "step": 3855 }, { "epoch": 0.44, "grad_norm": 0.7710799468736492, "learning_rate": 0.0006183606387079205, "loss": 3.6931, "step": 3856 }, { "epoch": 0.44, "grad_norm": 0.7284051428676792, "learning_rate": 0.0006181802131797821, "loss": 3.719, "step": 3857 }, { "epoch": 0.44, "grad_norm": 0.7852559383533071, "learning_rate": 0.0006179997713509359, "loss": 4.0149, "step": 3858 }, { "epoch": 0.44, "grad_norm": 0.9392422680763604, "learning_rate": 0.0006178193132462706, "loss": 3.9228, "step": 3859 }, { "epoch": 0.44, "grad_norm": 1.9569336960484904, "learning_rate": 0.000617638838890677, "loss": 3.7485, "step": 3860 }, { "epoch": 0.44, "grad_norm": 0.8201435614838167, "learning_rate": 0.0006174583483090478, "loss": 3.8718, "step": 3861 }, { "epoch": 0.44, "grad_norm": 0.7036772917473395, "learning_rate": 0.0006172778415262785, "loss": 4.0281, "step": 3862 }, { "epoch": 0.44, "grad_norm": 1.2744609108869263, "learning_rate": 0.0006170973185672664, "loss": 3.9149, "step": 3863 }, { "epoch": 0.44, "grad_norm": 0.7328722980347919, "learning_rate": 0.0006169167794569114, "loss": 3.719, "step": 3864 }, { "epoch": 0.44, "grad_norm": 2.041355506396354, "learning_rate": 0.0006167362242201153, "loss": 3.8833, "step": 3865 }, { "epoch": 0.44, "grad_norm": 0.6865209081952167, "learning_rate": 0.000616555652881782, "loss": 3.963, "step": 3866 }, { "epoch": 0.44, "grad_norm": 0.9013281681528934, "learning_rate": 0.0006163750654668186, "loss": 4.0113, "step": 3867 }, { "epoch": 0.44, "grad_norm": 0.6803745049542401, "learning_rate": 0.0006161944620001328, 
"loss": 3.8436, "step": 3868 }, { "epoch": 0.44, "grad_norm": 0.7047741107542304, "learning_rate": 0.0006160138425066361, "loss": 4.0263, "step": 3869 }, { "epoch": 0.44, "grad_norm": 0.8544811983718549, "learning_rate": 0.0006158332070112412, "loss": 3.7659, "step": 3870 }, { "epoch": 0.44, "grad_norm": 0.808562252633427, "learning_rate": 0.0006156525555388633, "loss": 3.8125, "step": 3871 }, { "epoch": 0.44, "grad_norm": 0.7544349216413252, "learning_rate": 0.0006154718881144199, "loss": 3.9262, "step": 3872 }, { "epoch": 0.44, "grad_norm": 0.654436572147052, "learning_rate": 0.0006152912047628307, "loss": 3.6394, "step": 3873 }, { "epoch": 0.44, "grad_norm": 1.3359607373269553, "learning_rate": 0.0006151105055090174, "loss": 3.6829, "step": 3874 }, { "epoch": 0.44, "grad_norm": 0.7261747694635734, "learning_rate": 0.0006149297903779042, "loss": 4.0045, "step": 3875 }, { "epoch": 0.44, "grad_norm": 0.9180650971812644, "learning_rate": 0.000614749059394417, "loss": 4.0597, "step": 3876 }, { "epoch": 0.44, "grad_norm": 1.0297187361890525, "learning_rate": 0.0006145683125834846, "loss": 3.8541, "step": 3877 }, { "epoch": 0.44, "grad_norm": 0.7771359011792478, "learning_rate": 0.000614387549970037, "loss": 4.0969, "step": 3878 }, { "epoch": 0.44, "grad_norm": 0.6705168358919751, "learning_rate": 0.0006142067715790077, "loss": 3.7241, "step": 3879 }, { "epoch": 0.44, "grad_norm": 1.6560306865773018, "learning_rate": 0.000614025977435331, "loss": 3.9512, "step": 3880 }, { "epoch": 0.45, "grad_norm": 0.672772571343512, "learning_rate": 0.0006138451675639443, "loss": 3.7344, "step": 3881 }, { "epoch": 0.45, "grad_norm": 0.6979371728326691, "learning_rate": 0.0006136643419897867, "loss": 3.8024, "step": 3882 }, { "epoch": 0.45, "grad_norm": 0.7414831851049143, "learning_rate": 0.0006134835007377999, "loss": 3.8546, "step": 3883 }, { "epoch": 0.45, "grad_norm": 0.5940963497859821, "learning_rate": 0.0006133026438329274, "loss": 3.7084, "step": 3884 }, { "epoch": 0.45, "grad_norm": 1.2588771085254826, "learning_rate": 0.0006131217713001148, "loss": 3.8, "step": 3885 }, { "epoch": 0.45, "grad_norm": 0.6609886672321553, "learning_rate": 0.0006129408831643102, "loss": 3.9306, "step": 3886 }, { "epoch": 0.45, "grad_norm": 0.6687669021072068, "learning_rate": 0.0006127599794504637, "loss": 3.8259, "step": 3887 }, { "epoch": 0.45, "grad_norm": 0.7624180837771212, "learning_rate": 0.0006125790601835272, "loss": 4.0075, "step": 3888 }, { "epoch": 0.45, "grad_norm": 0.8872413031740498, "learning_rate": 0.0006123981253884556, "loss": 3.9274, "step": 3889 }, { "epoch": 0.45, "grad_norm": 0.7270700151711248, "learning_rate": 0.000612217175090205, "loss": 3.8627, "step": 3890 }, { "epoch": 0.45, "grad_norm": 0.9969938771407136, "learning_rate": 0.000612036209313734, "loss": 3.9977, "step": 3891 }, { "epoch": 0.45, "grad_norm": 0.8445647682726748, "learning_rate": 0.0006118552280840037, "loss": 4.1237, "step": 3892 }, { "epoch": 0.45, "grad_norm": 0.7942841549421065, "learning_rate": 0.0006116742314259769, "loss": 3.7552, "step": 3893 }, { "epoch": 0.45, "grad_norm": 0.699908449884627, "learning_rate": 0.0006114932193646184, "loss": 3.8433, "step": 3894 }, { "epoch": 0.45, "grad_norm": 0.7010019197324008, "learning_rate": 0.0006113121919248957, "loss": 3.8157, "step": 3895 }, { "epoch": 0.45, "grad_norm": 0.774480911025677, "learning_rate": 0.0006111311491317778, "loss": 3.9803, "step": 3896 }, { "epoch": 0.45, "grad_norm": 0.8127852261812015, "learning_rate": 0.0006109500910102362, "loss": 4.125, "step": 3897 
}, { "epoch": 0.45, "grad_norm": 0.8453169588711743, "learning_rate": 0.0006107690175852445, "loss": 3.6863, "step": 3898 }, { "epoch": 0.45, "grad_norm": 1.0697018444829982, "learning_rate": 0.0006105879288817784, "loss": 3.836, "step": 3899 }, { "epoch": 0.45, "grad_norm": 0.6694550559614547, "learning_rate": 0.0006104068249248154, "loss": 3.7588, "step": 3900 }, { "epoch": 0.45, "grad_norm": 0.8187039580855431, "learning_rate": 0.0006102257057393354, "loss": 3.8192, "step": 3901 }, { "epoch": 0.45, "grad_norm": 0.736763157956692, "learning_rate": 0.0006100445713503206, "loss": 3.6476, "step": 3902 }, { "epoch": 0.45, "grad_norm": 0.68303623888795, "learning_rate": 0.0006098634217827545, "loss": 4.0022, "step": 3903 }, { "epoch": 0.45, "grad_norm": 0.7473083639356513, "learning_rate": 0.0006096822570616237, "loss": 3.8911, "step": 3904 }, { "epoch": 0.45, "grad_norm": 5.29278341793063, "learning_rate": 0.0006095010772119164, "loss": 3.9717, "step": 3905 }, { "epoch": 0.45, "grad_norm": 0.7676627554106904, "learning_rate": 0.0006093198822586226, "loss": 3.8882, "step": 3906 }, { "epoch": 0.45, "grad_norm": 0.8634542468287951, "learning_rate": 0.000609138672226735, "loss": 3.8558, "step": 3907 }, { "epoch": 0.45, "grad_norm": 0.7393096104437292, "learning_rate": 0.0006089574471412478, "loss": 3.9078, "step": 3908 }, { "epoch": 0.45, "grad_norm": 0.8481870973116039, "learning_rate": 0.0006087762070271578, "loss": 3.9145, "step": 3909 }, { "epoch": 0.45, "grad_norm": 0.8127930535670823, "learning_rate": 0.0006085949519094633, "loss": 3.8791, "step": 3910 }, { "epoch": 0.45, "grad_norm": 0.9521173062646392, "learning_rate": 0.0006084136818131654, "loss": 3.723, "step": 3911 }, { "epoch": 0.45, "grad_norm": 0.7567903634059139, "learning_rate": 0.0006082323967632667, "loss": 3.8699, "step": 3912 }, { "epoch": 0.45, "grad_norm": 0.7681700726889505, "learning_rate": 0.0006080510967847717, "loss": 3.941, "step": 3913 }, { "epoch": 0.45, "grad_norm": 0.648237673553497, "learning_rate": 0.0006078697819026878, "loss": 3.9685, "step": 3914 }, { "epoch": 0.45, "grad_norm": 0.9354775766472486, "learning_rate": 0.0006076884521420236, "loss": 3.7391, "step": 3915 }, { "epoch": 0.45, "grad_norm": 0.8840507745460041, "learning_rate": 0.00060750710752779, "loss": 3.9858, "step": 3916 }, { "epoch": 0.45, "grad_norm": 0.7572921639391266, "learning_rate": 0.0006073257480850004, "loss": 3.9341, "step": 3917 }, { "epoch": 0.45, "grad_norm": 0.7269881214526656, "learning_rate": 0.0006071443738386697, "loss": 3.8794, "step": 3918 }, { "epoch": 0.45, "grad_norm": 0.6625335461477138, "learning_rate": 0.0006069629848138148, "loss": 3.8181, "step": 3919 }, { "epoch": 0.45, "grad_norm": 0.7131466348402297, "learning_rate": 0.0006067815810354551, "loss": 3.8227, "step": 3920 }, { "epoch": 0.45, "grad_norm": 0.7190939218500794, "learning_rate": 0.0006066001625286118, "loss": 3.8487, "step": 3921 }, { "epoch": 0.45, "grad_norm": 0.8606347176437454, "learning_rate": 0.0006064187293183078, "loss": 3.8801, "step": 3922 }, { "epoch": 0.45, "grad_norm": 0.6754584878015163, "learning_rate": 0.0006062372814295689, "loss": 3.8428, "step": 3923 }, { "epoch": 0.45, "grad_norm": 0.8409182078776266, "learning_rate": 0.0006060558188874222, "loss": 3.6188, "step": 3924 }, { "epoch": 0.45, "grad_norm": 0.703828573032297, "learning_rate": 0.0006058743417168966, "loss": 3.9151, "step": 3925 }, { "epoch": 0.45, "grad_norm": 0.718944341346486, "learning_rate": 0.0006056928499430237, "loss": 4.1326, "step": 3926 }, { "epoch": 0.45, 
"grad_norm": 0.8162587180494851, "learning_rate": 0.0006055113435908372, "loss": 4.0417, "step": 3927 }, { "epoch": 0.45, "grad_norm": 0.7138506979309607, "learning_rate": 0.0006053298226853717, "loss": 3.7836, "step": 3928 }, { "epoch": 0.45, "grad_norm": 0.6880100518209579, "learning_rate": 0.0006051482872516652, "loss": 3.9509, "step": 3929 }, { "epoch": 0.45, "grad_norm": 0.700701373824494, "learning_rate": 0.0006049667373147566, "loss": 3.8258, "step": 3930 }, { "epoch": 0.45, "grad_norm": 0.7799432747634579, "learning_rate": 0.0006047851728996875, "loss": 3.761, "step": 3931 }, { "epoch": 0.45, "grad_norm": 0.8878572647682944, "learning_rate": 0.0006046035940315011, "loss": 4.1669, "step": 3932 }, { "epoch": 0.45, "grad_norm": 0.7789838409215758, "learning_rate": 0.0006044220007352429, "loss": 3.8294, "step": 3933 }, { "epoch": 0.45, "grad_norm": 1.3334866543701285, "learning_rate": 0.0006042403930359603, "loss": 3.7869, "step": 3934 }, { "epoch": 0.45, "grad_norm": 0.7160145140259562, "learning_rate": 0.0006040587709587024, "loss": 3.8822, "step": 3935 }, { "epoch": 0.45, "grad_norm": 0.7958013481321649, "learning_rate": 0.0006038771345285204, "loss": 3.9326, "step": 3936 }, { "epoch": 0.45, "grad_norm": 0.7470701512615304, "learning_rate": 0.000603695483770468, "loss": 3.6696, "step": 3937 }, { "epoch": 0.45, "grad_norm": 0.7499476865016649, "learning_rate": 0.0006035138187096, "loss": 4.2284, "step": 3938 }, { "epoch": 0.45, "grad_norm": 0.7270560939828855, "learning_rate": 0.0006033321393709741, "loss": 3.6399, "step": 3939 }, { "epoch": 0.45, "grad_norm": 0.9526943611041022, "learning_rate": 0.0006031504457796493, "loss": 3.9005, "step": 3940 }, { "epoch": 0.45, "grad_norm": 1.2402721360298194, "learning_rate": 0.0006029687379606864, "loss": 3.8153, "step": 3941 }, { "epoch": 0.45, "grad_norm": 0.6954270256027031, "learning_rate": 0.0006027870159391491, "loss": 3.6862, "step": 3942 }, { "epoch": 0.45, "grad_norm": 0.6516170766412759, "learning_rate": 0.0006026052797401022, "loss": 3.8669, "step": 3943 }, { "epoch": 0.45, "grad_norm": 0.7351148230308584, "learning_rate": 0.0006024235293886127, "loss": 3.8504, "step": 3944 }, { "epoch": 0.45, "grad_norm": 0.7513226017025635, "learning_rate": 0.0006022417649097499, "loss": 3.7981, "step": 3945 }, { "epoch": 0.45, "grad_norm": 0.7176468819724566, "learning_rate": 0.0006020599863285845, "loss": 3.857, "step": 3946 }, { "epoch": 0.45, "grad_norm": 0.6532634106151174, "learning_rate": 0.0006018781936701893, "loss": 3.9533, "step": 3947 }, { "epoch": 0.45, "grad_norm": 0.8807028514906022, "learning_rate": 0.0006016963869596392, "loss": 4.0785, "step": 3948 }, { "epoch": 0.45, "grad_norm": 0.9092147281644758, "learning_rate": 0.0006015145662220113, "loss": 3.8925, "step": 3949 }, { "epoch": 0.45, "grad_norm": 1.1079491151348881, "learning_rate": 0.0006013327314823838, "loss": 3.852, "step": 3950 }, { "epoch": 0.45, "grad_norm": 0.6956599013621375, "learning_rate": 0.0006011508827658376, "loss": 3.6848, "step": 3951 }, { "epoch": 0.45, "grad_norm": 0.7594131521520358, "learning_rate": 0.0006009690200974553, "loss": 4.0841, "step": 3952 }, { "epoch": 0.45, "grad_norm": 0.9913538573575377, "learning_rate": 0.0006007871435023213, "loss": 3.8201, "step": 3953 }, { "epoch": 0.45, "grad_norm": 0.8307382363811768, "learning_rate": 0.0006006052530055221, "loss": 3.6915, "step": 3954 }, { "epoch": 0.45, "grad_norm": 0.9259426117404608, "learning_rate": 0.0006004233486321459, "loss": 3.5436, "step": 3955 }, { "epoch": 0.45, "grad_norm": 
0.6770684737967306, "learning_rate": 0.000600241430407283, "loss": 3.7751, "step": 3956 }, { "epoch": 0.45, "grad_norm": 0.6956102095152675, "learning_rate": 0.0006000594983560255, "loss": 3.7941, "step": 3957 }, { "epoch": 0.45, "grad_norm": 1.1238148049705925, "learning_rate": 0.0005998775525034675, "loss": 3.773, "step": 3958 }, { "epoch": 0.45, "grad_norm": 0.9382213891357185, "learning_rate": 0.0005996955928747051, "loss": 3.9914, "step": 3959 }, { "epoch": 0.45, "grad_norm": 0.8351440909480151, "learning_rate": 0.0005995136194948359, "loss": 3.7111, "step": 3960 }, { "epoch": 0.45, "grad_norm": 0.6654689646891007, "learning_rate": 0.0005993316323889598, "loss": 3.95, "step": 3961 }, { "epoch": 0.45, "grad_norm": 0.7707420839672277, "learning_rate": 0.0005991496315821786, "loss": 3.8026, "step": 3962 }, { "epoch": 0.45, "grad_norm": 0.6053860848942868, "learning_rate": 0.0005989676170995954, "loss": 3.9219, "step": 3963 }, { "epoch": 0.45, "grad_norm": 0.9217744353417774, "learning_rate": 0.0005987855889663163, "loss": 3.8673, "step": 3964 }, { "epoch": 0.45, "grad_norm": 1.8829062353637223, "learning_rate": 0.000598603547207448, "loss": 3.8126, "step": 3965 }, { "epoch": 0.45, "grad_norm": 0.8780133112452879, "learning_rate": 0.0005984214918480999, "loss": 3.8253, "step": 3966 }, { "epoch": 0.45, "grad_norm": 0.7107066455242557, "learning_rate": 0.0005982394229133832, "loss": 3.8856, "step": 3967 }, { "epoch": 0.45, "grad_norm": 0.7194443219797562, "learning_rate": 0.0005980573404284107, "loss": 3.83, "step": 3968 }, { "epoch": 0.46, "grad_norm": 0.6628328896318604, "learning_rate": 0.000597875244418297, "loss": 3.8545, "step": 3969 }, { "epoch": 0.46, "grad_norm": 0.6876200149666233, "learning_rate": 0.0005976931349081593, "loss": 4.1159, "step": 3970 }, { "epoch": 0.46, "grad_norm": 1.1904295656437875, "learning_rate": 0.0005975110119231157, "loss": 3.9251, "step": 3971 }, { "epoch": 0.46, "grad_norm": 0.8144771190860535, "learning_rate": 0.0005973288754882867, "loss": 4.0726, "step": 3972 }, { "epoch": 0.46, "grad_norm": 0.7140424771933278, "learning_rate": 0.0005971467256287947, "loss": 4.0284, "step": 3973 }, { "epoch": 0.46, "grad_norm": 0.6460297561347226, "learning_rate": 0.0005969645623697636, "loss": 3.9807, "step": 3974 }, { "epoch": 0.46, "grad_norm": 0.7098337361150873, "learning_rate": 0.0005967823857363195, "loss": 3.6378, "step": 3975 }, { "epoch": 0.46, "grad_norm": 0.6891991392954582, "learning_rate": 0.0005966001957535901, "loss": 3.7206, "step": 3976 }, { "epoch": 0.46, "grad_norm": 0.9392250273147966, "learning_rate": 0.000596417992446705, "loss": 3.7509, "step": 3977 }, { "epoch": 0.46, "grad_norm": 0.7722868649445921, "learning_rate": 0.0005962357758407958, "loss": 3.9044, "step": 3978 }, { "epoch": 0.46, "grad_norm": 0.7458267983310527, "learning_rate": 0.0005960535459609957, "loss": 3.9087, "step": 3979 }, { "epoch": 0.46, "grad_norm": 0.6663421957893071, "learning_rate": 0.00059587130283244, "loss": 3.9494, "step": 3980 }, { "epoch": 0.46, "grad_norm": 0.7744183852306287, "learning_rate": 0.0005956890464802654, "loss": 3.8424, "step": 3981 }, { "epoch": 0.46, "grad_norm": 2.144322928787315, "learning_rate": 0.0005955067769296109, "loss": 3.8649, "step": 3982 }, { "epoch": 0.46, "grad_norm": 0.9332367713656605, "learning_rate": 0.0005953244942056171, "loss": 3.7373, "step": 3983 }, { "epoch": 0.46, "grad_norm": 1.1302435251121092, "learning_rate": 0.0005951421983334263, "loss": 3.8145, "step": 3984 }, { "epoch": 0.46, "grad_norm": 1.2319475245888967, 
"learning_rate": 0.0005949598893381828, "loss": 3.8815, "step": 3985 }, { "epoch": 0.46, "grad_norm": 0.7532883834412366, "learning_rate": 0.0005947775672450326, "loss": 3.9378, "step": 3986 }, { "epoch": 0.46, "grad_norm": 1.808785383514495, "learning_rate": 0.0005945952320791239, "loss": 3.9121, "step": 3987 }, { "epoch": 0.46, "grad_norm": 1.5952123013919977, "learning_rate": 0.0005944128838656059, "loss": 3.8712, "step": 3988 }, { "epoch": 0.46, "grad_norm": 0.8193851950272943, "learning_rate": 0.0005942305226296302, "loss": 4.0175, "step": 3989 }, { "epoch": 0.46, "grad_norm": 0.7407518262070221, "learning_rate": 0.0005940481483963502, "loss": 3.8959, "step": 3990 }, { "epoch": 0.46, "grad_norm": 0.6201495312767138, "learning_rate": 0.0005938657611909206, "loss": 3.6479, "step": 3991 }, { "epoch": 0.46, "grad_norm": 0.7679695762149001, "learning_rate": 0.0005936833610384988, "loss": 3.8782, "step": 3992 }, { "epoch": 0.46, "grad_norm": 0.8137778103530727, "learning_rate": 0.0005935009479642431, "loss": 3.8492, "step": 3993 }, { "epoch": 0.46, "grad_norm": 0.7045152681220851, "learning_rate": 0.0005933185219933137, "loss": 3.9656, "step": 3994 }, { "epoch": 0.46, "grad_norm": 1.0439995008762348, "learning_rate": 0.0005931360831508732, "loss": 3.6638, "step": 3995 }, { "epoch": 0.46, "grad_norm": 0.7877972903464969, "learning_rate": 0.0005929536314620852, "loss": 3.8691, "step": 3996 }, { "epoch": 0.46, "grad_norm": 0.8134221172355398, "learning_rate": 0.0005927711669521156, "loss": 3.8238, "step": 3997 }, { "epoch": 0.46, "grad_norm": 0.6660344245344365, "learning_rate": 0.0005925886896461321, "loss": 3.7862, "step": 3998 }, { "epoch": 0.46, "grad_norm": 1.0096586537092138, "learning_rate": 0.0005924061995693036, "loss": 3.9487, "step": 3999 }, { "epoch": 0.46, "grad_norm": 0.637025741328825, "learning_rate": 0.0005922236967468013, "loss": 3.7122, "step": 4000 }, { "epoch": 0.46, "grad_norm": 0.6729756388716194, "learning_rate": 0.000592041181203798, "loss": 3.9193, "step": 4001 }, { "epoch": 0.46, "grad_norm": 0.6231806914162396, "learning_rate": 0.0005918586529654684, "loss": 3.7519, "step": 4002 }, { "epoch": 0.46, "grad_norm": 0.7923287890771714, "learning_rate": 0.0005916761120569883, "loss": 3.9566, "step": 4003 }, { "epoch": 0.46, "grad_norm": 0.7463618474417574, "learning_rate": 0.0005914935585035363, "loss": 3.7316, "step": 4004 }, { "epoch": 0.46, "grad_norm": 0.706784621929304, "learning_rate": 0.0005913109923302919, "loss": 3.7225, "step": 4005 }, { "epoch": 0.46, "grad_norm": 0.6459299100581579, "learning_rate": 0.0005911284135624365, "loss": 3.8081, "step": 4006 }, { "epoch": 0.46, "grad_norm": 0.7587194238155516, "learning_rate": 0.0005909458222251536, "loss": 3.9267, "step": 4007 }, { "epoch": 0.46, "grad_norm": 0.642713501558682, "learning_rate": 0.0005907632183436281, "loss": 3.9265, "step": 4008 }, { "epoch": 0.46, "grad_norm": 0.7030387786553617, "learning_rate": 0.0005905806019430468, "loss": 3.8092, "step": 4009 }, { "epoch": 0.46, "grad_norm": 0.6896741659945483, "learning_rate": 0.0005903979730485979, "loss": 4.0289, "step": 4010 }, { "epoch": 0.46, "grad_norm": 0.8449943944369459, "learning_rate": 0.0005902153316854718, "loss": 3.7637, "step": 4011 }, { "epoch": 0.46, "grad_norm": 0.9728986439513934, "learning_rate": 0.0005900326778788605, "loss": 4.0899, "step": 4012 }, { "epoch": 0.46, "grad_norm": 0.877977943243662, "learning_rate": 0.0005898500116539575, "loss": 3.7982, "step": 4013 }, { "epoch": 0.46, "grad_norm": 0.6658001842768428, "learning_rate": 
0.0005896673330359579, "loss": 3.5347, "step": 4014 }, { "epoch": 0.46, "grad_norm": 0.7489395770189999, "learning_rate": 0.000589484642050059, "loss": 3.8631, "step": 4015 }, { "epoch": 0.46, "grad_norm": 0.9899072683126479, "learning_rate": 0.0005893019387214595, "loss": 3.9049, "step": 4016 }, { "epoch": 0.46, "grad_norm": 0.6483670030102223, "learning_rate": 0.0005891192230753597, "loss": 3.845, "step": 4017 }, { "epoch": 0.46, "grad_norm": 0.8569234907994701, "learning_rate": 0.000588936495136962, "loss": 4.0143, "step": 4018 }, { "epoch": 0.46, "grad_norm": 0.6899604451047773, "learning_rate": 0.0005887537549314699, "loss": 3.8409, "step": 4019 }, { "epoch": 0.46, "grad_norm": 0.7870744283473945, "learning_rate": 0.0005885710024840893, "loss": 3.6438, "step": 4020 }, { "epoch": 0.46, "grad_norm": 1.1626613890504514, "learning_rate": 0.000588388237820027, "loss": 3.7507, "step": 4021 }, { "epoch": 0.46, "grad_norm": 0.7294456385707205, "learning_rate": 0.0005882054609644923, "loss": 3.8542, "step": 4022 }, { "epoch": 0.46, "grad_norm": 0.6813342928663897, "learning_rate": 0.0005880226719426956, "loss": 3.833, "step": 4023 }, { "epoch": 0.46, "grad_norm": 0.8265497420682203, "learning_rate": 0.0005878398707798491, "loss": 3.8396, "step": 4024 }, { "epoch": 0.46, "grad_norm": 0.6502418531148979, "learning_rate": 0.0005876570575011668, "loss": 3.8268, "step": 4025 }, { "epoch": 0.46, "grad_norm": 0.8081651964017412, "learning_rate": 0.0005874742321318643, "loss": 3.7915, "step": 4026 }, { "epoch": 0.46, "grad_norm": 0.6405056653381834, "learning_rate": 0.0005872913946971591, "loss": 4.042, "step": 4027 }, { "epoch": 0.46, "grad_norm": 1.0418715291407483, "learning_rate": 0.0005871085452222697, "loss": 3.9346, "step": 4028 }, { "epoch": 0.46, "grad_norm": 0.6902224080608588, "learning_rate": 0.000586925683732417, "loss": 3.9474, "step": 4029 }, { "epoch": 0.46, "grad_norm": 0.695534552022739, "learning_rate": 0.0005867428102528233, "loss": 4.0311, "step": 4030 }, { "epoch": 0.46, "grad_norm": 0.6595772638556843, "learning_rate": 0.0005865599248087122, "loss": 3.8849, "step": 4031 }, { "epoch": 0.46, "grad_norm": 0.6586397009211438, "learning_rate": 0.0005863770274253095, "loss": 4.0574, "step": 4032 }, { "epoch": 0.46, "grad_norm": 0.7037311071940167, "learning_rate": 0.0005861941181278425, "loss": 3.666, "step": 4033 }, { "epoch": 0.46, "grad_norm": 0.7686096468420669, "learning_rate": 0.0005860111969415397, "loss": 3.841, "step": 4034 }, { "epoch": 0.46, "grad_norm": 1.067827921907857, "learning_rate": 0.0005858282638916319, "loss": 3.653, "step": 4035 }, { "epoch": 0.46, "grad_norm": 1.0678440426699534, "learning_rate": 0.0005856453190033512, "loss": 4.0077, "step": 4036 }, { "epoch": 0.46, "grad_norm": 0.9482149386075516, "learning_rate": 0.0005854623623019313, "loss": 3.8982, "step": 4037 }, { "epoch": 0.46, "grad_norm": 0.6717299006231169, "learning_rate": 0.0005852793938126074, "loss": 3.8567, "step": 4038 }, { "epoch": 0.46, "grad_norm": 0.8910574273442414, "learning_rate": 0.0005850964135606169, "loss": 4.1356, "step": 4039 }, { "epoch": 0.46, "grad_norm": 0.6369442711591836, "learning_rate": 0.000584913421571198, "loss": 3.7077, "step": 4040 }, { "epoch": 0.46, "grad_norm": 0.6912577911744474, "learning_rate": 0.0005847304178695914, "loss": 3.762, "step": 4041 }, { "epoch": 0.46, "grad_norm": 0.756680355246937, "learning_rate": 0.0005845474024810387, "loss": 3.9031, "step": 4042 }, { "epoch": 0.46, "grad_norm": 0.6930754095027756, "learning_rate": 0.0005843643754307834, "loss": 
3.8602, "step": 4043 }, { "epoch": 0.46, "grad_norm": 2.918906277294368, "learning_rate": 0.0005841813367440707, "loss": 4.0204, "step": 4044 }, { "epoch": 0.46, "grad_norm": 0.6675832344146952, "learning_rate": 0.0005839982864461473, "loss": 4.0758, "step": 4045 }, { "epoch": 0.46, "grad_norm": 0.72416041628074, "learning_rate": 0.0005838152245622614, "loss": 3.9706, "step": 4046 }, { "epoch": 0.46, "grad_norm": 0.7283630056898933, "learning_rate": 0.0005836321511176628, "loss": 3.781, "step": 4047 }, { "epoch": 0.46, "grad_norm": 0.6709376338578446, "learning_rate": 0.0005834490661376033, "loss": 3.8546, "step": 4048 }, { "epoch": 0.46, "grad_norm": 1.43904934595593, "learning_rate": 0.0005832659696473356, "loss": 3.8627, "step": 4049 }, { "epoch": 0.46, "grad_norm": 1.040537084899615, "learning_rate": 0.0005830828616721148, "loss": 3.9885, "step": 4050 }, { "epoch": 0.46, "grad_norm": 0.7154095260739417, "learning_rate": 0.0005828997422371967, "loss": 3.8984, "step": 4051 }, { "epoch": 0.46, "grad_norm": 0.6840467466197048, "learning_rate": 0.0005827166113678398, "loss": 3.8197, "step": 4052 }, { "epoch": 0.46, "grad_norm": 0.6607271787741962, "learning_rate": 0.0005825334690893028, "loss": 3.9173, "step": 4053 }, { "epoch": 0.46, "grad_norm": 0.7044230839520097, "learning_rate": 0.0005823503154268468, "loss": 3.7764, "step": 4054 }, { "epoch": 0.46, "grad_norm": 1.8166876021379663, "learning_rate": 0.0005821671504057348, "loss": 3.6638, "step": 4055 }, { "epoch": 0.47, "grad_norm": 0.6873827813465201, "learning_rate": 0.0005819839740512305, "loss": 3.7156, "step": 4056 }, { "epoch": 0.47, "grad_norm": 0.7037228175288416, "learning_rate": 0.0005818007863885998, "loss": 3.785, "step": 4057 }, { "epoch": 0.47, "grad_norm": 0.8312718255739672, "learning_rate": 0.00058161758744311, "loss": 3.8489, "step": 4058 }, { "epoch": 0.47, "grad_norm": 1.3764847875423245, "learning_rate": 0.0005814343772400296, "loss": 3.8513, "step": 4059 }, { "epoch": 0.47, "grad_norm": 3.197207598596104, "learning_rate": 0.0005812511558046291, "loss": 3.7902, "step": 4060 }, { "epoch": 0.47, "grad_norm": 0.813055907960531, "learning_rate": 0.0005810679231621806, "loss": 4.0559, "step": 4061 }, { "epoch": 0.47, "grad_norm": 0.7727988839541204, "learning_rate": 0.0005808846793379575, "loss": 3.8883, "step": 4062 }, { "epoch": 0.47, "grad_norm": 0.6301698841032853, "learning_rate": 0.0005807014243572343, "loss": 3.8015, "step": 4063 }, { "epoch": 0.47, "grad_norm": 1.5943105513355782, "learning_rate": 0.0005805181582452882, "loss": 3.8756, "step": 4064 }, { "epoch": 0.47, "grad_norm": 0.7534833119870524, "learning_rate": 0.0005803348810273969, "loss": 4.0577, "step": 4065 }, { "epoch": 0.47, "grad_norm": 0.7816035082859037, "learning_rate": 0.0005801515927288401, "loss": 4.1597, "step": 4066 }, { "epoch": 0.47, "grad_norm": 0.7448180753505125, "learning_rate": 0.000579968293374899, "loss": 3.9524, "step": 4067 }, { "epoch": 0.47, "grad_norm": 0.7424659101627653, "learning_rate": 0.0005797849829908561, "loss": 3.73, "step": 4068 }, { "epoch": 0.47, "grad_norm": 0.6761235529711822, "learning_rate": 0.0005796016616019955, "loss": 4.0099, "step": 4069 }, { "epoch": 0.47, "grad_norm": 0.7006585154077003, "learning_rate": 0.0005794183292336032, "loss": 3.9295, "step": 4070 }, { "epoch": 0.47, "grad_norm": 2.0890965264795724, "learning_rate": 0.0005792349859109662, "loss": 4.0376, "step": 4071 }, { "epoch": 0.47, "grad_norm": 0.9797368360122771, "learning_rate": 0.0005790516316593733, "loss": 3.7838, "step": 4072 }, { 
"epoch": 0.47, "grad_norm": 0.8015717835554922, "learning_rate": 0.0005788682665041146, "loss": 3.9758, "step": 4073 }, { "epoch": 0.47, "grad_norm": 1.0416296525994264, "learning_rate": 0.0005786848904704818, "loss": 4.0572, "step": 4074 }, { "epoch": 0.47, "grad_norm": 0.8335661827897813, "learning_rate": 0.0005785015035837684, "loss": 3.9293, "step": 4075 }, { "epoch": 0.47, "grad_norm": 0.6416652829124934, "learning_rate": 0.000578318105869269, "loss": 3.629, "step": 4076 }, { "epoch": 0.47, "grad_norm": 0.7157879574166962, "learning_rate": 0.0005781346973522797, "loss": 3.9308, "step": 4077 }, { "epoch": 0.47, "grad_norm": 0.6979321181847583, "learning_rate": 0.0005779512780580981, "loss": 4.0069, "step": 4078 }, { "epoch": 0.47, "grad_norm": 0.7938756516818022, "learning_rate": 0.0005777678480120237, "loss": 3.9794, "step": 4079 }, { "epoch": 0.47, "grad_norm": 0.8791790499210107, "learning_rate": 0.0005775844072393573, "loss": 3.8914, "step": 4080 }, { "epoch": 0.47, "grad_norm": 0.6973336617665977, "learning_rate": 0.0005774009557654006, "loss": 3.6434, "step": 4081 }, { "epoch": 0.47, "grad_norm": 0.7815634466314726, "learning_rate": 0.0005772174936154573, "loss": 3.9924, "step": 4082 }, { "epoch": 0.47, "grad_norm": 0.6743844613904448, "learning_rate": 0.000577034020814833, "loss": 3.7524, "step": 4083 }, { "epoch": 0.47, "grad_norm": 0.6557253938943693, "learning_rate": 0.0005768505373888337, "loss": 3.9622, "step": 4084 }, { "epoch": 0.47, "grad_norm": 0.6909033663393117, "learning_rate": 0.0005766670433627677, "loss": 4.0295, "step": 4085 }, { "epoch": 0.47, "grad_norm": 1.779520355928976, "learning_rate": 0.0005764835387619444, "loss": 3.8868, "step": 4086 }, { "epoch": 0.47, "grad_norm": 0.93303596207143, "learning_rate": 0.000576300023611675, "loss": 3.5997, "step": 4087 }, { "epoch": 0.47, "grad_norm": 0.8315040665691679, "learning_rate": 0.0005761164979372716, "loss": 3.934, "step": 4088 }, { "epoch": 0.47, "grad_norm": 0.7323603611304464, "learning_rate": 0.0005759329617640483, "loss": 3.8249, "step": 4089 }, { "epoch": 0.47, "grad_norm": 0.6797167837873807, "learning_rate": 0.0005757494151173204, "loss": 3.7031, "step": 4090 }, { "epoch": 0.47, "grad_norm": 0.8462671702179962, "learning_rate": 0.0005755658580224043, "loss": 3.7597, "step": 4091 }, { "epoch": 0.47, "grad_norm": 1.1217306560067015, "learning_rate": 0.0005753822905046189, "loss": 3.813, "step": 4092 }, { "epoch": 0.47, "grad_norm": 0.804969021977847, "learning_rate": 0.0005751987125892833, "loss": 3.8371, "step": 4093 }, { "epoch": 0.47, "grad_norm": 0.720369326932804, "learning_rate": 0.0005750151243017187, "loss": 3.9555, "step": 4094 }, { "epoch": 0.47, "grad_norm": 0.7976701461708695, "learning_rate": 0.0005748315256672476, "loss": 3.7239, "step": 4095 }, { "epoch": 0.47, "grad_norm": 0.7503300173356574, "learning_rate": 0.000574647916711194, "loss": 3.9855, "step": 4096 }, { "epoch": 0.47, "grad_norm": 0.7039568727381197, "learning_rate": 0.000574464297458883, "loss": 3.7449, "step": 4097 }, { "epoch": 0.47, "grad_norm": 0.6916793154192109, "learning_rate": 0.000574280667935642, "loss": 4.0476, "step": 4098 }, { "epoch": 0.47, "grad_norm": 0.784478207844224, "learning_rate": 0.0005740970281667984, "loss": 3.7933, "step": 4099 }, { "epoch": 0.47, "grad_norm": 0.9239971176373486, "learning_rate": 0.0005739133781776824, "loss": 4.2081, "step": 4100 }, { "epoch": 0.47, "grad_norm": 0.8012430900709699, "learning_rate": 0.0005737297179936247, "loss": 3.9358, "step": 4101 }, { "epoch": 0.47, "grad_norm": 
0.684685416070157, "learning_rate": 0.0005735460476399579, "loss": 3.7037, "step": 4102 }, { "epoch": 0.47, "grad_norm": 0.7062605132832812, "learning_rate": 0.0005733623671420156, "loss": 3.9287, "step": 4103 }, { "epoch": 0.47, "grad_norm": 0.6509487039510518, "learning_rate": 0.0005731786765251333, "loss": 3.8739, "step": 4104 }, { "epoch": 0.47, "grad_norm": 0.8772317262125492, "learning_rate": 0.0005729949758146475, "loss": 4.0113, "step": 4105 }, { "epoch": 0.47, "grad_norm": 0.6876106167293412, "learning_rate": 0.0005728112650358961, "loss": 3.7565, "step": 4106 }, { "epoch": 0.47, "grad_norm": 0.6715318428113342, "learning_rate": 0.0005726275442142185, "loss": 3.7601, "step": 4107 }, { "epoch": 0.47, "grad_norm": 0.7442237056033825, "learning_rate": 0.0005724438133749559, "loss": 4.0, "step": 4108 }, { "epoch": 0.47, "grad_norm": 0.6781792634826368, "learning_rate": 0.0005722600725434499, "loss": 3.8977, "step": 4109 }, { "epoch": 0.47, "grad_norm": 0.9040099166026978, "learning_rate": 0.0005720763217450443, "loss": 3.7057, "step": 4110 }, { "epoch": 0.47, "grad_norm": 1.0026310180830451, "learning_rate": 0.000571892561005084, "loss": 3.8525, "step": 4111 }, { "epoch": 0.47, "grad_norm": 0.7118106902114532, "learning_rate": 0.0005717087903489155, "loss": 4.2662, "step": 4112 }, { "epoch": 0.47, "grad_norm": 0.643109068047393, "learning_rate": 0.000571525009801886, "loss": 3.8007, "step": 4113 }, { "epoch": 0.47, "grad_norm": 0.8168321534780815, "learning_rate": 0.0005713412193893451, "loss": 3.8205, "step": 4114 }, { "epoch": 0.47, "grad_norm": 0.6281378436414965, "learning_rate": 0.0005711574191366427, "loss": 3.9952, "step": 4115 }, { "epoch": 0.47, "grad_norm": 0.806636119951407, "learning_rate": 0.0005709736090691305, "loss": 4.0388, "step": 4116 }, { "epoch": 0.47, "grad_norm": 0.683207608416473, "learning_rate": 0.0005707897892121621, "loss": 4.1477, "step": 4117 }, { "epoch": 0.47, "grad_norm": 0.6384674554310215, "learning_rate": 0.0005706059595910918, "loss": 3.8215, "step": 4118 }, { "epoch": 0.47, "grad_norm": 0.6345717555893181, "learning_rate": 0.0005704221202312748, "loss": 3.8842, "step": 4119 }, { "epoch": 0.47, "grad_norm": 1.0756279877630317, "learning_rate": 0.000570238271158069, "loss": 3.9364, "step": 4120 }, { "epoch": 0.47, "grad_norm": 0.6427335427696915, "learning_rate": 0.0005700544123968325, "loss": 4.0756, "step": 4121 }, { "epoch": 0.47, "grad_norm": 0.7550553529738694, "learning_rate": 0.0005698705439729251, "loss": 4.0977, "step": 4122 }, { "epoch": 0.47, "grad_norm": 0.6325033096333672, "learning_rate": 0.0005696866659117081, "loss": 4.0817, "step": 4123 }, { "epoch": 0.47, "grad_norm": 0.6352606555877425, "learning_rate": 0.0005695027782385438, "loss": 3.8634, "step": 4124 }, { "epoch": 0.47, "grad_norm": 0.6999266402013214, "learning_rate": 0.0005693188809787961, "loss": 3.9116, "step": 4125 }, { "epoch": 0.47, "grad_norm": 0.7393916470093013, "learning_rate": 0.00056913497415783, "loss": 4.0097, "step": 4126 }, { "epoch": 0.47, "grad_norm": 0.76617135339059, "learning_rate": 0.0005689510578010123, "loss": 3.6596, "step": 4127 }, { "epoch": 0.47, "grad_norm": 1.5492138193057108, "learning_rate": 0.0005687671319337103, "loss": 3.8509, "step": 4128 }, { "epoch": 0.47, "grad_norm": 0.6186959790885198, "learning_rate": 0.0005685831965812933, "loss": 3.9059, "step": 4129 }, { "epoch": 0.47, "grad_norm": 0.7307471122378822, "learning_rate": 0.0005683992517691318, "loss": 3.8521, "step": 4130 }, { "epoch": 0.47, "grad_norm": 0.7132614384152902, 
"learning_rate": 0.0005682152975225972, "loss": 4.1754, "step": 4131 }, { "epoch": 0.47, "grad_norm": 0.7797265096478379, "learning_rate": 0.0005680313338670627, "loss": 3.9919, "step": 4132 }, { "epoch": 0.47, "grad_norm": 0.7980813652788946, "learning_rate": 0.0005678473608279024, "loss": 3.6934, "step": 4133 }, { "epoch": 0.47, "grad_norm": 0.7132905638606477, "learning_rate": 0.000567663378430492, "loss": 3.905, "step": 4134 }, { "epoch": 0.47, "grad_norm": 0.6799074470765357, "learning_rate": 0.0005674793867002083, "loss": 4.077, "step": 4135 }, { "epoch": 0.47, "grad_norm": 0.6646376976737701, "learning_rate": 0.0005672953856624294, "loss": 3.9322, "step": 4136 }, { "epoch": 0.47, "grad_norm": 0.7525364463052419, "learning_rate": 0.000567111375342535, "loss": 3.6307, "step": 4137 }, { "epoch": 0.47, "grad_norm": 0.9293934622023164, "learning_rate": 0.0005669273557659055, "loss": 4.0277, "step": 4138 }, { "epoch": 0.47, "grad_norm": 0.7045722123452295, "learning_rate": 0.000566743326957923, "loss": 3.7057, "step": 4139 }, { "epoch": 0.47, "grad_norm": 0.8624459897354622, "learning_rate": 0.0005665592889439709, "loss": 3.9933, "step": 4140 }, { "epoch": 0.47, "grad_norm": 0.9451309662682881, "learning_rate": 0.0005663752417494334, "loss": 3.9795, "step": 4141 }, { "epoch": 0.47, "grad_norm": 0.8224199458234307, "learning_rate": 0.0005661911853996969, "loss": 4.0775, "step": 4142 }, { "epoch": 0.48, "grad_norm": 0.6529379837960625, "learning_rate": 0.0005660071199201479, "loss": 3.6285, "step": 4143 }, { "epoch": 0.48, "grad_norm": 0.7561638167677699, "learning_rate": 0.0005658230453361748, "loss": 3.9711, "step": 4144 }, { "epoch": 0.48, "grad_norm": 0.7829333533915648, "learning_rate": 0.0005656389616731675, "loss": 3.8266, "step": 4145 }, { "epoch": 0.48, "grad_norm": 0.7156766171385884, "learning_rate": 0.0005654548689565164, "loss": 4.0188, "step": 4146 }, { "epoch": 0.48, "grad_norm": 1.2517695464967091, "learning_rate": 0.000565270767211614, "loss": 3.9405, "step": 4147 }, { "epoch": 0.48, "grad_norm": 0.68593181778253, "learning_rate": 0.0005650866564638535, "loss": 3.9319, "step": 4148 }, { "epoch": 0.48, "grad_norm": 0.7410714117015248, "learning_rate": 0.0005649025367386292, "loss": 3.8439, "step": 4149 }, { "epoch": 0.48, "grad_norm": 0.7497270242293124, "learning_rate": 0.0005647184080613371, "loss": 3.7982, "step": 4150 }, { "epoch": 0.48, "grad_norm": 2.8976688785911695, "learning_rate": 0.0005645342704573744, "loss": 3.9652, "step": 4151 }, { "epoch": 0.48, "grad_norm": 1.1296531788752588, "learning_rate": 0.0005643501239521393, "loss": 3.8241, "step": 4152 }, { "epoch": 0.48, "grad_norm": 0.6470194991322522, "learning_rate": 0.0005641659685710309, "loss": 4.0277, "step": 4153 }, { "epoch": 0.48, "grad_norm": 0.8476661801654558, "learning_rate": 0.0005639818043394506, "loss": 4.0061, "step": 4154 }, { "epoch": 0.48, "grad_norm": 0.6771147675738165, "learning_rate": 0.0005637976312827998, "loss": 4.1007, "step": 4155 }, { "epoch": 0.48, "grad_norm": 0.692419064431505, "learning_rate": 0.000563613449426482, "loss": 3.939, "step": 4156 }, { "epoch": 0.48, "grad_norm": 1.1038128266556082, "learning_rate": 0.0005634292587959012, "loss": 3.8414, "step": 4157 }, { "epoch": 0.48, "grad_norm": 0.8268234639182892, "learning_rate": 0.0005632450594164635, "loss": 3.7544, "step": 4158 }, { "epoch": 0.48, "grad_norm": 0.7174093262790885, "learning_rate": 0.0005630608513135752, "loss": 3.9962, "step": 4159 }, { "epoch": 0.48, "grad_norm": 0.7243216341197899, "learning_rate": 
0.0005628766345126446, "loss": 3.8701, "step": 4160 }, { "epoch": 0.48, "grad_norm": 0.7405087417770971, "learning_rate": 0.0005626924090390808, "loss": 3.8009, "step": 4161 }, { "epoch": 0.48, "grad_norm": 0.6456486288078073, "learning_rate": 0.0005625081749182942, "loss": 3.9917, "step": 4162 }, { "epoch": 0.48, "grad_norm": 0.8111748845946625, "learning_rate": 0.0005623239321756963, "loss": 4.115, "step": 4163 }, { "epoch": 0.48, "grad_norm": 0.7282251328817294, "learning_rate": 0.0005621396808367001, "loss": 3.8912, "step": 4164 }, { "epoch": 0.48, "grad_norm": 0.6855091964193534, "learning_rate": 0.0005619554209267193, "loss": 3.6085, "step": 4165 }, { "epoch": 0.48, "grad_norm": 0.9269545907850576, "learning_rate": 0.0005617711524711691, "loss": 3.8385, "step": 4166 }, { "epoch": 0.48, "grad_norm": 2.6175911143602524, "learning_rate": 0.0005615868754954661, "loss": 3.9545, "step": 4167 }, { "epoch": 0.48, "grad_norm": 1.1103577058248206, "learning_rate": 0.0005614025900250276, "loss": 3.7955, "step": 4168 }, { "epoch": 0.48, "grad_norm": 0.664757317860562, "learning_rate": 0.000561218296085272, "loss": 3.7481, "step": 4169 }, { "epoch": 0.48, "grad_norm": 0.6520435693282834, "learning_rate": 0.0005610339937016195, "loss": 3.8605, "step": 4170 }, { "epoch": 0.48, "grad_norm": 0.7865181742027115, "learning_rate": 0.0005608496828994911, "loss": 3.9462, "step": 4171 }, { "epoch": 0.48, "grad_norm": 0.8201219139604599, "learning_rate": 0.0005606653637043087, "loss": 3.7654, "step": 4172 }, { "epoch": 0.48, "grad_norm": 0.8766655116847327, "learning_rate": 0.0005604810361414958, "loss": 4.1486, "step": 4173 }, { "epoch": 0.48, "grad_norm": 0.7313593009147609, "learning_rate": 0.0005602967002364768, "loss": 3.5726, "step": 4174 }, { "epoch": 0.48, "grad_norm": 3.295467631178779, "learning_rate": 0.0005601123560146774, "loss": 3.7589, "step": 4175 }, { "epoch": 0.48, "grad_norm": 1.159446503321656, "learning_rate": 0.0005599280035015243, "loss": 3.7791, "step": 4176 }, { "epoch": 0.48, "grad_norm": 0.688679422203639, "learning_rate": 0.0005597436427224455, "loss": 3.7162, "step": 4177 }, { "epoch": 0.48, "grad_norm": 0.6432887500505773, "learning_rate": 0.00055955927370287, "loss": 3.9123, "step": 4178 }, { "epoch": 0.48, "grad_norm": 0.7172747979869595, "learning_rate": 0.000559374896468228, "loss": 3.8083, "step": 4179 }, { "epoch": 0.48, "grad_norm": 1.320637945085686, "learning_rate": 0.0005591905110439507, "loss": 4.0285, "step": 4180 }, { "epoch": 0.48, "grad_norm": 0.9348141219831981, "learning_rate": 0.0005590061174554708, "loss": 3.7304, "step": 4181 }, { "epoch": 0.48, "grad_norm": 2.1733826640485883, "learning_rate": 0.0005588217157282217, "loss": 3.895, "step": 4182 }, { "epoch": 0.48, "grad_norm": 0.647677457848519, "learning_rate": 0.0005586373058876383, "loss": 3.87, "step": 4183 }, { "epoch": 0.48, "grad_norm": 0.8538660188188185, "learning_rate": 0.0005584528879591562, "loss": 3.7059, "step": 4184 }, { "epoch": 0.48, "grad_norm": 10.578015485136753, "learning_rate": 0.0005582684619682123, "loss": 3.8197, "step": 4185 }, { "epoch": 0.48, "grad_norm": 0.8225594858126504, "learning_rate": 0.0005580840279402448, "loss": 3.8543, "step": 4186 }, { "epoch": 0.48, "grad_norm": 1.1412247465390828, "learning_rate": 0.000557899585900693, "loss": 3.7639, "step": 4187 }, { "epoch": 0.48, "grad_norm": 0.6122191691806615, "learning_rate": 0.0005577151358749968, "loss": 3.9094, "step": 4188 }, { "epoch": 0.48, "grad_norm": 0.7203590205760791, "learning_rate": 0.0005575306778885978, "loss": 
3.6707, "step": 4189 }, { "epoch": 0.48, "grad_norm": 0.8425185037210182, "learning_rate": 0.0005573462119669386, "loss": 3.7694, "step": 4190 }, { "epoch": 0.48, "grad_norm": 1.6006313262159622, "learning_rate": 0.0005571617381354622, "loss": 3.7552, "step": 4191 }, { "epoch": 0.48, "grad_norm": 1.3111962589581583, "learning_rate": 0.0005569772564196139, "loss": 3.9519, "step": 4192 }, { "epoch": 0.48, "grad_norm": 0.8913698733860103, "learning_rate": 0.0005567927668448392, "loss": 4.0054, "step": 4193 }, { "epoch": 0.48, "grad_norm": 1.2750344281924373, "learning_rate": 0.0005566082694365847, "loss": 3.8583, "step": 4194 }, { "epoch": 0.48, "grad_norm": 0.6750789506649579, "learning_rate": 0.0005564237642202987, "loss": 3.8947, "step": 4195 }, { "epoch": 0.48, "grad_norm": 0.6098092691542901, "learning_rate": 0.0005562392512214299, "loss": 3.9312, "step": 4196 }, { "epoch": 0.48, "grad_norm": 0.7033392893536247, "learning_rate": 0.0005560547304654282, "loss": 3.6497, "step": 4197 }, { "epoch": 0.48, "grad_norm": 0.7026395681909104, "learning_rate": 0.0005558702019777452, "loss": 3.6493, "step": 4198 }, { "epoch": 0.48, "grad_norm": 0.7565622061085707, "learning_rate": 0.0005556856657838327, "loss": 3.7711, "step": 4199 }, { "epoch": 0.48, "grad_norm": 0.8189698067726675, "learning_rate": 0.000555501121909144, "loss": 3.6615, "step": 4200 }, { "epoch": 0.48, "grad_norm": 0.7074839863033818, "learning_rate": 0.0005553165703791335, "loss": 3.9459, "step": 4201 }, { "epoch": 0.48, "grad_norm": 0.9387422967146135, "learning_rate": 0.0005551320112192567, "loss": 3.9128, "step": 4202 }, { "epoch": 0.48, "grad_norm": 0.7250268368409484, "learning_rate": 0.0005549474444549698, "loss": 3.7162, "step": 4203 }, { "epoch": 0.48, "grad_norm": 1.6463227462711834, "learning_rate": 0.0005547628701117303, "loss": 3.9438, "step": 4204 }, { "epoch": 0.48, "grad_norm": 0.9805071822219112, "learning_rate": 0.0005545782882149968, "loss": 4.0811, "step": 4205 }, { "epoch": 0.48, "grad_norm": 0.8786509568891585, "learning_rate": 0.0005543936987902287, "loss": 3.8913, "step": 4206 }, { "epoch": 0.48, "grad_norm": 0.7821324398222718, "learning_rate": 0.0005542091018628867, "loss": 3.9467, "step": 4207 }, { "epoch": 0.48, "grad_norm": 0.6317290535521419, "learning_rate": 0.0005540244974584325, "loss": 4.0567, "step": 4208 }, { "epoch": 0.48, "grad_norm": 1.247545670640134, "learning_rate": 0.0005538398856023285, "loss": 3.8088, "step": 4209 }, { "epoch": 0.48, "grad_norm": 1.0147232319991144, "learning_rate": 0.0005536552663200387, "loss": 3.8911, "step": 4210 }, { "epoch": 0.48, "grad_norm": 0.7657883990090287, "learning_rate": 0.0005534706396370277, "loss": 3.8647, "step": 4211 }, { "epoch": 0.48, "grad_norm": 0.7388793082355344, "learning_rate": 0.0005532860055787611, "loss": 3.6911, "step": 4212 }, { "epoch": 0.48, "grad_norm": 0.7118742717327231, "learning_rate": 0.0005531013641707059, "loss": 4.0844, "step": 4213 }, { "epoch": 0.48, "grad_norm": 0.7356277025586998, "learning_rate": 0.0005529167154383296, "loss": 3.7661, "step": 4214 }, { "epoch": 0.48, "grad_norm": 3.8897015085591513, "learning_rate": 0.0005527320594071012, "loss": 4.0792, "step": 4215 }, { "epoch": 0.48, "grad_norm": 0.7212364154404345, "learning_rate": 0.0005525473961024901, "loss": 3.8941, "step": 4216 }, { "epoch": 0.48, "grad_norm": 1.8467740769339882, "learning_rate": 0.0005523627255499677, "loss": 3.9265, "step": 4217 }, { "epoch": 0.48, "grad_norm": 0.7633844151514193, "learning_rate": 0.0005521780477750054, "loss": 3.8393, "step": 
4218 }, { "epoch": 0.48, "grad_norm": 0.6522275761736759, "learning_rate": 0.0005519933628030757, "loss": 3.8258, "step": 4219 }, { "epoch": 0.48, "grad_norm": 0.7508381841006275, "learning_rate": 0.000551808670659653, "loss": 3.8636, "step": 4220 }, { "epoch": 0.48, "grad_norm": 0.7794570144976212, "learning_rate": 0.0005516239713702116, "loss": 3.4696, "step": 4221 }, { "epoch": 0.48, "grad_norm": 0.6937909877352628, "learning_rate": 0.0005514392649602273, "loss": 3.9134, "step": 4222 }, { "epoch": 0.48, "grad_norm": 0.706813703719516, "learning_rate": 0.0005512545514551772, "loss": 3.8676, "step": 4223 }, { "epoch": 0.48, "grad_norm": 0.6805093413853006, "learning_rate": 0.0005510698308805385, "loss": 3.7448, "step": 4224 }, { "epoch": 0.48, "grad_norm": 0.6477899805012503, "learning_rate": 0.0005508851032617902, "loss": 3.7749, "step": 4225 }, { "epoch": 0.48, "grad_norm": 0.7684325969982762, "learning_rate": 0.0005507003686244119, "loss": 3.7617, "step": 4226 }, { "epoch": 0.48, "grad_norm": 1.2925032713981046, "learning_rate": 0.0005505156269938841, "loss": 3.9198, "step": 4227 }, { "epoch": 0.48, "grad_norm": 0.8856263559031764, "learning_rate": 0.0005503308783956886, "loss": 3.6596, "step": 4228 }, { "epoch": 0.48, "grad_norm": 1.2638562589909192, "learning_rate": 0.0005501461228553075, "loss": 3.8036, "step": 4229 }, { "epoch": 0.49, "grad_norm": 1.0370183373814548, "learning_rate": 0.000549961360398225, "loss": 4.1053, "step": 4230 }, { "epoch": 0.49, "grad_norm": 0.6646943677610788, "learning_rate": 0.0005497765910499249, "loss": 4.0116, "step": 4231 }, { "epoch": 0.49, "grad_norm": 0.7080819839868919, "learning_rate": 0.0005495918148358931, "loss": 3.9024, "step": 4232 }, { "epoch": 0.49, "grad_norm": 1.1166588314779935, "learning_rate": 0.0005494070317816157, "loss": 3.9844, "step": 4233 }, { "epoch": 0.49, "grad_norm": 0.703247472130302, "learning_rate": 0.00054922224191258, "loss": 3.7608, "step": 4234 }, { "epoch": 0.49, "grad_norm": 0.7422774828944972, "learning_rate": 0.0005490374452542743, "loss": 3.8575, "step": 4235 }, { "epoch": 0.49, "grad_norm": 0.691512963002237, "learning_rate": 0.0005488526418321877, "loss": 3.8403, "step": 4236 }, { "epoch": 0.49, "grad_norm": 1.4109280816234409, "learning_rate": 0.0005486678316718106, "loss": 3.8098, "step": 4237 }, { "epoch": 0.49, "grad_norm": 1.3241521759242338, "learning_rate": 0.0005484830147986336, "loss": 3.7438, "step": 4238 }, { "epoch": 0.49, "grad_norm": 0.7182256746273659, "learning_rate": 0.0005482981912381488, "loss": 3.9354, "step": 4239 }, { "epoch": 0.49, "grad_norm": 0.7208668329725527, "learning_rate": 0.0005481133610158494, "loss": 3.8039, "step": 4240 }, { "epoch": 0.49, "grad_norm": 0.7751535930013567, "learning_rate": 0.0005479285241572288, "loss": 3.6738, "step": 4241 }, { "epoch": 0.49, "grad_norm": 0.7721998400869257, "learning_rate": 0.000547743680687782, "loss": 4.0421, "step": 4242 }, { "epoch": 0.49, "grad_norm": 0.7535008101039398, "learning_rate": 0.0005475588306330046, "loss": 3.8513, "step": 4243 }, { "epoch": 0.49, "grad_norm": 0.6726846995361984, "learning_rate": 0.0005473739740183928, "loss": 3.8357, "step": 4244 }, { "epoch": 0.49, "grad_norm": 0.6388302450030526, "learning_rate": 0.0005471891108694446, "loss": 3.6509, "step": 4245 }, { "epoch": 0.49, "grad_norm": 1.051833638847868, "learning_rate": 0.0005470042412116579, "loss": 3.8951, "step": 4246 }, { "epoch": 0.49, "grad_norm": 0.6622886243781417, "learning_rate": 0.0005468193650705321, "loss": 3.9558, "step": 4247 }, { "epoch": 0.49, 
"grad_norm": 0.6981647601685309, "learning_rate": 0.0005466344824715674, "loss": 3.7269, "step": 4248 }, { "epoch": 0.49, "grad_norm": 0.6338424392291375, "learning_rate": 0.0005464495934402648, "loss": 4.0732, "step": 4249 }, { "epoch": 0.49, "grad_norm": 2.0251233673452567, "learning_rate": 0.0005462646980021262, "loss": 4.0006, "step": 4250 }, { "epoch": 0.49, "grad_norm": 0.6826347438279452, "learning_rate": 0.0005460797961826542, "loss": 3.9917, "step": 4251 }, { "epoch": 0.49, "grad_norm": 0.6295712372740135, "learning_rate": 0.000545894888007353, "loss": 3.7748, "step": 4252 }, { "epoch": 0.49, "grad_norm": 0.8254276402154788, "learning_rate": 0.0005457099735017267, "loss": 4.103, "step": 4253 }, { "epoch": 0.49, "grad_norm": 0.7133225056219541, "learning_rate": 0.0005455250526912807, "loss": 3.8597, "step": 4254 }, { "epoch": 0.49, "grad_norm": 0.7345046223653513, "learning_rate": 0.0005453401256015218, "loss": 3.9996, "step": 4255 }, { "epoch": 0.49, "grad_norm": 0.701354541075247, "learning_rate": 0.0005451551922579566, "loss": 3.913, "step": 4256 }, { "epoch": 0.49, "grad_norm": 0.7286416379073204, "learning_rate": 0.0005449702526860934, "loss": 4.1452, "step": 4257 }, { "epoch": 0.49, "grad_norm": 0.6855426703719808, "learning_rate": 0.0005447853069114411, "loss": 3.6099, "step": 4258 }, { "epoch": 0.49, "grad_norm": 0.7490191288016946, "learning_rate": 0.0005446003549595093, "loss": 3.9763, "step": 4259 }, { "epoch": 0.49, "grad_norm": 1.2921010113006524, "learning_rate": 0.0005444153968558088, "loss": 3.8563, "step": 4260 }, { "epoch": 0.49, "grad_norm": 0.7526760685771456, "learning_rate": 0.0005442304326258508, "loss": 3.7003, "step": 4261 }, { "epoch": 0.49, "grad_norm": 0.6482061232933054, "learning_rate": 0.000544045462295148, "loss": 3.7068, "step": 4262 }, { "epoch": 0.49, "grad_norm": 0.6770558559343279, "learning_rate": 0.000543860485889213, "loss": 4.0117, "step": 4263 }, { "epoch": 0.49, "grad_norm": 0.8127331749384585, "learning_rate": 0.00054367550343356, "loss": 3.6717, "step": 4264 }, { "epoch": 0.49, "grad_norm": 0.8934266624088781, "learning_rate": 0.000543490514953704, "loss": 3.79, "step": 4265 }, { "epoch": 0.49, "grad_norm": 0.657391726017079, "learning_rate": 0.0005433055204751604, "loss": 3.8838, "step": 4266 }, { "epoch": 0.49, "grad_norm": 0.7038043328528225, "learning_rate": 0.0005431205200234457, "loss": 3.9341, "step": 4267 }, { "epoch": 0.49, "grad_norm": 0.6275531314048974, "learning_rate": 0.0005429355136240773, "loss": 3.7322, "step": 4268 }, { "epoch": 0.49, "grad_norm": 0.7247376126119753, "learning_rate": 0.0005427505013025731, "loss": 4.0391, "step": 4269 }, { "epoch": 0.49, "grad_norm": 0.660749010897533, "learning_rate": 0.0005425654830844523, "loss": 3.8773, "step": 4270 }, { "epoch": 0.49, "grad_norm": 0.8046728236573004, "learning_rate": 0.0005423804589952344, "loss": 4.1054, "step": 4271 }, { "epoch": 0.49, "grad_norm": 0.7670295302155274, "learning_rate": 0.0005421954290604399, "loss": 3.9025, "step": 4272 }, { "epoch": 0.49, "grad_norm": 0.6306089240098544, "learning_rate": 0.0005420103933055906, "loss": 3.9606, "step": 4273 }, { "epoch": 0.49, "grad_norm": 1.3720089677340093, "learning_rate": 0.0005418253517562079, "loss": 3.9228, "step": 4274 }, { "epoch": 0.49, "grad_norm": 0.9713300686462119, "learning_rate": 0.0005416403044378156, "loss": 3.5703, "step": 4275 }, { "epoch": 0.49, "grad_norm": 0.6366394894029324, "learning_rate": 0.0005414552513759368, "loss": 3.8064, "step": 4276 }, { "epoch": 0.49, "grad_norm": 
0.7714979718363879, "learning_rate": 0.0005412701925960964, "loss": 3.7883, "step": 4277 }, { "epoch": 0.49, "grad_norm": 1.0252723693542565, "learning_rate": 0.0005410851281238195, "loss": 3.8399, "step": 4278 }, { "epoch": 0.49, "grad_norm": 1.666272304038758, "learning_rate": 0.0005409000579846324, "loss": 3.7148, "step": 4279 }, { "epoch": 0.49, "grad_norm": 0.7200249660558814, "learning_rate": 0.0005407149822040619, "loss": 3.7106, "step": 4280 }, { "epoch": 0.49, "grad_norm": 0.8751406463046254, "learning_rate": 0.0005405299008076357, "loss": 3.8723, "step": 4281 }, { "epoch": 0.49, "grad_norm": 1.1178489768236044, "learning_rate": 0.0005403448138208823, "loss": 3.8026, "step": 4282 }, { "epoch": 0.49, "grad_norm": 0.9708300514218405, "learning_rate": 0.0005401597212693308, "loss": 3.7588, "step": 4283 }, { "epoch": 0.49, "grad_norm": 0.7367187280982936, "learning_rate": 0.0005399746231785113, "loss": 3.8241, "step": 4284 }, { "epoch": 0.49, "grad_norm": 1.5849456134150224, "learning_rate": 0.0005397895195739545, "loss": 4.0618, "step": 4285 }, { "epoch": 0.49, "grad_norm": 0.6902935789806055, "learning_rate": 0.0005396044104811921, "loss": 3.7287, "step": 4286 }, { "epoch": 0.49, "grad_norm": 0.9415839753589215, "learning_rate": 0.000539419295925756, "loss": 3.7814, "step": 4287 }, { "epoch": 0.49, "grad_norm": 0.7280940653030167, "learning_rate": 0.0005392341759331795, "loss": 3.8307, "step": 4288 }, { "epoch": 0.49, "grad_norm": 0.7630692801878005, "learning_rate": 0.0005390490505289962, "loss": 3.7742, "step": 4289 }, { "epoch": 0.49, "grad_norm": 0.8073473299226849, "learning_rate": 0.0005388639197387409, "loss": 3.96, "step": 4290 }, { "epoch": 0.49, "grad_norm": 0.631193663644436, "learning_rate": 0.0005386787835879486, "loss": 3.8134, "step": 4291 }, { "epoch": 0.49, "grad_norm": 0.6708700186613333, "learning_rate": 0.0005384936421021553, "loss": 3.9932, "step": 4292 }, { "epoch": 0.49, "grad_norm": 0.7631162051964475, "learning_rate": 0.0005383084953068981, "loss": 3.9015, "step": 4293 }, { "epoch": 0.49, "grad_norm": 0.7479184728744441, "learning_rate": 0.0005381233432277139, "loss": 3.7861, "step": 4294 }, { "epoch": 0.49, "grad_norm": 0.6758725049254787, "learning_rate": 0.0005379381858901413, "loss": 3.8973, "step": 4295 }, { "epoch": 0.49, "grad_norm": 0.7868253259816288, "learning_rate": 0.0005377530233197191, "loss": 4.0599, "step": 4296 }, { "epoch": 0.49, "grad_norm": 1.5126624415575685, "learning_rate": 0.000537567855541987, "loss": 3.9736, "step": 4297 }, { "epoch": 0.49, "grad_norm": 0.6513445278387031, "learning_rate": 0.0005373826825824854, "loss": 4.0385, "step": 4298 }, { "epoch": 0.49, "grad_norm": 0.9003586690033151, "learning_rate": 0.0005371975044667553, "loss": 4.0622, "step": 4299 }, { "epoch": 0.49, "grad_norm": 0.8794015941280305, "learning_rate": 0.0005370123212203384, "loss": 4.0212, "step": 4300 }, { "epoch": 0.49, "grad_norm": 0.6939371300975568, "learning_rate": 0.0005368271328687774, "loss": 3.8855, "step": 4301 }, { "epoch": 0.49, "grad_norm": 1.3268719449324284, "learning_rate": 0.0005366419394376154, "loss": 3.8987, "step": 4302 }, { "epoch": 0.49, "grad_norm": 0.6270866732204826, "learning_rate": 0.0005364567409523963, "loss": 3.9043, "step": 4303 }, { "epoch": 0.49, "grad_norm": 0.8666200529812793, "learning_rate": 0.0005362715374386646, "loss": 3.7395, "step": 4304 }, { "epoch": 0.49, "grad_norm": 1.4895891808794974, "learning_rate": 0.0005360863289219659, "loss": 4.0658, "step": 4305 }, { "epoch": 0.49, "grad_norm": 2.0635058672501643, 
"learning_rate": 0.000535901115427846, "loss": 3.9612, "step": 4306 }, { "epoch": 0.49, "grad_norm": 0.6587120406014885, "learning_rate": 0.0005357158969818514, "loss": 3.7911, "step": 4307 }, { "epoch": 0.49, "grad_norm": 0.9344782274964042, "learning_rate": 0.0005355306736095298, "loss": 3.8643, "step": 4308 }, { "epoch": 0.49, "grad_norm": 0.645740743247194, "learning_rate": 0.000535345445336429, "loss": 3.6671, "step": 4309 }, { "epoch": 0.49, "grad_norm": 0.7696586061278271, "learning_rate": 0.0005351602121880976, "loss": 3.8968, "step": 4310 }, { "epoch": 0.49, "grad_norm": 0.7745613244563733, "learning_rate": 0.0005349749741900853, "loss": 3.8705, "step": 4311 }, { "epoch": 0.49, "grad_norm": 0.746637791896984, "learning_rate": 0.0005347897313679419, "loss": 4.2019, "step": 4312 }, { "epoch": 0.49, "grad_norm": 0.680785652713342, "learning_rate": 0.0005346044837472182, "loss": 3.8401, "step": 4313 }, { "epoch": 0.49, "grad_norm": 0.7498587783260213, "learning_rate": 0.0005344192313534657, "loss": 3.8545, "step": 4314 }, { "epoch": 0.49, "grad_norm": 0.7006211735775378, "learning_rate": 0.0005342339742122363, "loss": 4.068, "step": 4315 }, { "epoch": 0.49, "grad_norm": 0.8338531933973969, "learning_rate": 0.0005340487123490826, "loss": 3.9449, "step": 4316 }, { "epoch": 0.5, "grad_norm": 0.6427850983150066, "learning_rate": 0.0005338634457895582, "loss": 3.8072, "step": 4317 }, { "epoch": 0.5, "grad_norm": 0.7023389511454344, "learning_rate": 0.000533678174559217, "loss": 3.6633, "step": 4318 }, { "epoch": 0.5, "grad_norm": 0.7225056432792876, "learning_rate": 0.0005334928986836133, "loss": 3.9289, "step": 4319 }, { "epoch": 0.5, "grad_norm": 0.7194347539523325, "learning_rate": 0.000533307618188303, "loss": 4.1196, "step": 4320 }, { "epoch": 0.5, "grad_norm": 0.6602833915597414, "learning_rate": 0.0005331223330988414, "loss": 3.8519, "step": 4321 }, { "epoch": 0.5, "grad_norm": 0.7060487871432689, "learning_rate": 0.0005329370434407854, "loss": 3.7306, "step": 4322 }, { "epoch": 0.5, "grad_norm": 0.6856790847146028, "learning_rate": 0.0005327517492396922, "loss": 3.9753, "step": 4323 }, { "epoch": 0.5, "grad_norm": 0.7323074954869072, "learning_rate": 0.0005325664505211194, "loss": 4.0269, "step": 4324 }, { "epoch": 0.5, "grad_norm": 1.5496071919459014, "learning_rate": 0.0005323811473106256, "loss": 3.846, "step": 4325 }, { "epoch": 0.5, "grad_norm": 0.7143079250446023, "learning_rate": 0.0005321958396337696, "loss": 3.9955, "step": 4326 }, { "epoch": 0.5, "grad_norm": 0.7710088189758366, "learning_rate": 0.0005320105275161115, "loss": 3.9299, "step": 4327 }, { "epoch": 0.5, "grad_norm": 1.0122304602470806, "learning_rate": 0.0005318252109832111, "loss": 3.8672, "step": 4328 }, { "epoch": 0.5, "grad_norm": 0.7900544810101969, "learning_rate": 0.0005316398900606296, "loss": 3.7841, "step": 4329 }, { "epoch": 0.5, "grad_norm": 1.4573409150841719, "learning_rate": 0.0005314545647739283, "loss": 3.8774, "step": 4330 }, { "epoch": 0.5, "grad_norm": 0.6460043006926358, "learning_rate": 0.0005312692351486693, "loss": 3.6037, "step": 4331 }, { "epoch": 0.5, "grad_norm": 0.6952721559655148, "learning_rate": 0.0005310839012104155, "loss": 3.8973, "step": 4332 }, { "epoch": 0.5, "grad_norm": 0.6667537660888871, "learning_rate": 0.00053089856298473, "loss": 3.7834, "step": 4333 }, { "epoch": 0.5, "grad_norm": 1.7703249724807752, "learning_rate": 0.0005307132204971768, "loss": 3.7442, "step": 4334 }, { "epoch": 0.5, "grad_norm": 0.690704215550814, "learning_rate": 0.00053052787377332, "loss": 
3.6913, "step": 4335 }, { "epoch": 0.5, "grad_norm": 0.5950153192166955, "learning_rate": 0.0005303425228387251, "loss": 3.9633, "step": 4336 }, { "epoch": 0.5, "grad_norm": 0.6793319964571636, "learning_rate": 0.0005301571677189576, "loss": 3.7317, "step": 4337 }, { "epoch": 0.5, "grad_norm": 1.1255877520892084, "learning_rate": 0.0005299718084395837, "loss": 3.8928, "step": 4338 }, { "epoch": 0.5, "grad_norm": 0.8681617543612836, "learning_rate": 0.00052978644502617, "loss": 4.0361, "step": 4339 }, { "epoch": 0.5, "grad_norm": 0.6411618231870494, "learning_rate": 0.0005296010775042841, "loss": 3.7235, "step": 4340 }, { "epoch": 0.5, "grad_norm": 0.6639776224385691, "learning_rate": 0.0005294157058994936, "loss": 4.0082, "step": 4341 }, { "epoch": 0.5, "grad_norm": 0.792819643737473, "learning_rate": 0.0005292303302373674, "loss": 3.9906, "step": 4342 }, { "epoch": 0.5, "grad_norm": 0.6317125014254703, "learning_rate": 0.0005290449505434744, "loss": 3.914, "step": 4343 }, { "epoch": 0.5, "grad_norm": 0.6654505862068125, "learning_rate": 0.0005288595668433839, "loss": 3.7816, "step": 4344 }, { "epoch": 0.5, "grad_norm": 0.7597361594442525, "learning_rate": 0.0005286741791626664, "loss": 3.6733, "step": 4345 }, { "epoch": 0.5, "grad_norm": 0.6663607010879168, "learning_rate": 0.0005284887875268925, "loss": 3.8469, "step": 4346 }, { "epoch": 0.5, "grad_norm": 10.807512405626634, "learning_rate": 0.0005283033919616331, "loss": 4.0914, "step": 4347 }, { "epoch": 0.5, "grad_norm": 0.6687774837686334, "learning_rate": 0.0005281179924924608, "loss": 3.6541, "step": 4348 }, { "epoch": 0.5, "grad_norm": 0.6172431212845514, "learning_rate": 0.000527932589144947, "loss": 3.8396, "step": 4349 }, { "epoch": 0.5, "grad_norm": 0.6927991894368641, "learning_rate": 0.0005277471819446651, "loss": 4.1757, "step": 4350 }, { "epoch": 0.5, "grad_norm": 0.6831443862297305, "learning_rate": 0.0005275617709171882, "loss": 3.8422, "step": 4351 }, { "epoch": 0.5, "grad_norm": 0.6359879851521837, "learning_rate": 0.0005273763560880907, "loss": 3.9217, "step": 4352 }, { "epoch": 0.5, "grad_norm": 0.8195417733225039, "learning_rate": 0.0005271909374829466, "loss": 3.8677, "step": 4353 }, { "epoch": 0.5, "grad_norm": 0.7417967722487566, "learning_rate": 0.0005270055151273309, "loss": 3.9606, "step": 4354 }, { "epoch": 0.5, "grad_norm": 0.7735729746555182, "learning_rate": 0.0005268200890468192, "loss": 3.8473, "step": 4355 }, { "epoch": 0.5, "grad_norm": 0.6376697320994279, "learning_rate": 0.0005266346592669875, "loss": 3.9098, "step": 4356 }, { "epoch": 0.5, "grad_norm": 0.6035067298075151, "learning_rate": 0.0005264492258134121, "loss": 3.7546, "step": 4357 }, { "epoch": 0.5, "grad_norm": 1.4371075993701425, "learning_rate": 0.0005262637887116703, "loss": 3.8696, "step": 4358 }, { "epoch": 0.5, "grad_norm": 0.6754423690987267, "learning_rate": 0.0005260783479873396, "loss": 3.8452, "step": 4359 }, { "epoch": 0.5, "grad_norm": 0.6215521372594712, "learning_rate": 0.0005258929036659976, "loss": 3.8333, "step": 4360 }, { "epoch": 0.5, "grad_norm": 2.078635638241706, "learning_rate": 0.0005257074557732232, "loss": 3.8039, "step": 4361 }, { "epoch": 0.5, "grad_norm": 0.8918820262386129, "learning_rate": 0.0005255220043345956, "loss": 4.0359, "step": 4362 }, { "epoch": 0.5, "grad_norm": 0.9022788376750367, "learning_rate": 0.0005253365493756936, "loss": 3.9064, "step": 4363 }, { "epoch": 0.5, "grad_norm": 0.7772980739022383, "learning_rate": 0.0005251510909220977, "loss": 3.8393, "step": 4364 }, { "epoch": 0.5, 
"grad_norm": 0.7747700915929809, "learning_rate": 0.0005249656289993883, "loss": 4.0858, "step": 4365 }, { "epoch": 0.5, "grad_norm": 0.9550961665566325, "learning_rate": 0.0005247801636331462, "loss": 3.9227, "step": 4366 }, { "epoch": 0.5, "grad_norm": 0.9577926875568097, "learning_rate": 0.000524594694848953, "loss": 3.9543, "step": 4367 }, { "epoch": 0.5, "grad_norm": 1.917705450548486, "learning_rate": 0.0005244092226723903, "loss": 3.9743, "step": 4368 }, { "epoch": 0.5, "grad_norm": 0.7804998060102647, "learning_rate": 0.0005242237471290407, "loss": 4.0892, "step": 4369 }, { "epoch": 0.5, "grad_norm": 0.7095561562093544, "learning_rate": 0.0005240382682444868, "loss": 3.8159, "step": 4370 }, { "epoch": 0.5, "grad_norm": 4.464766020809735, "learning_rate": 0.0005238527860443122, "loss": 3.8298, "step": 4371 }, { "epoch": 0.5, "grad_norm": 0.5786097376596528, "learning_rate": 0.0005236673005541003, "loss": 3.9793, "step": 4372 }, { "epoch": 0.5, "grad_norm": 0.7943485993626705, "learning_rate": 0.0005234818117994355, "loss": 3.7228, "step": 4373 }, { "epoch": 0.5, "grad_norm": 0.7957160522801602, "learning_rate": 0.0005232963198059024, "loss": 3.8363, "step": 4374 }, { "epoch": 0.5, "grad_norm": 0.6943929208890544, "learning_rate": 0.000523110824599086, "loss": 3.8449, "step": 4375 }, { "epoch": 0.5, "grad_norm": 0.763727311840107, "learning_rate": 0.0005229253262045719, "loss": 3.948, "step": 4376 }, { "epoch": 0.5, "grad_norm": 0.6114901279774198, "learning_rate": 0.000522739824647946, "loss": 3.7401, "step": 4377 }, { "epoch": 0.5, "grad_norm": 1.1120212247429346, "learning_rate": 0.0005225543199547948, "loss": 3.8469, "step": 4378 }, { "epoch": 0.5, "grad_norm": 0.625150512341303, "learning_rate": 0.0005223688121507051, "loss": 3.7646, "step": 4379 }, { "epoch": 0.5, "grad_norm": 0.7877588633148302, "learning_rate": 0.0005221833012612642, "loss": 3.7804, "step": 4380 }, { "epoch": 0.5, "grad_norm": 0.8316543642278383, "learning_rate": 0.0005219977873120596, "loss": 3.7654, "step": 4381 }, { "epoch": 0.5, "grad_norm": 1.3286621167976347, "learning_rate": 0.0005218122703286797, "loss": 3.6907, "step": 4382 }, { "epoch": 0.5, "grad_norm": 0.969967442605985, "learning_rate": 0.0005216267503367127, "loss": 3.8867, "step": 4383 }, { "epoch": 0.5, "grad_norm": 1.0060951749786806, "learning_rate": 0.0005214412273617478, "loss": 3.9185, "step": 4384 }, { "epoch": 0.5, "grad_norm": 0.9360136332954375, "learning_rate": 0.0005212557014293744, "loss": 3.9445, "step": 4385 }, { "epoch": 0.5, "grad_norm": 0.8261770178071651, "learning_rate": 0.0005210701725651821, "loss": 3.9077, "step": 4386 }, { "epoch": 0.5, "grad_norm": 0.7619150613477956, "learning_rate": 0.0005208846407947612, "loss": 3.7018, "step": 4387 }, { "epoch": 0.5, "grad_norm": 0.8274379029709685, "learning_rate": 0.000520699106143702, "loss": 3.9354, "step": 4388 }, { "epoch": 0.5, "grad_norm": 3.6479426994225155, "learning_rate": 0.0005205135686375958, "loss": 3.6637, "step": 4389 }, { "epoch": 0.5, "grad_norm": 0.6633710468941912, "learning_rate": 0.0005203280283020338, "loss": 3.704, "step": 4390 }, { "epoch": 0.5, "grad_norm": 0.6782065008042092, "learning_rate": 0.0005201424851626078, "loss": 4.1121, "step": 4391 }, { "epoch": 0.5, "grad_norm": 0.6398829488367108, "learning_rate": 0.00051995693924491, "loss": 3.6957, "step": 4392 }, { "epoch": 0.5, "grad_norm": 0.75088011482761, "learning_rate": 0.0005197713905745328, "loss": 3.8235, "step": 4393 }, { "epoch": 0.5, "grad_norm": 0.8364889522861264, "learning_rate": 
0.0005195858391770689, "loss": 4.0245, "step": 4394 }, { "epoch": 0.5, "grad_norm": 0.6648231504037965, "learning_rate": 0.0005194002850781122, "loss": 4.056, "step": 4395 }, { "epoch": 0.5, "grad_norm": 0.6278183736103358, "learning_rate": 0.0005192147283032557, "loss": 3.7125, "step": 4396 }, { "epoch": 0.5, "grad_norm": 0.6255510716316568, "learning_rate": 0.000519029168878094, "loss": 4.028, "step": 4397 }, { "epoch": 0.5, "grad_norm": 1.4154381428137544, "learning_rate": 0.000518843606828221, "loss": 3.6961, "step": 4398 }, { "epoch": 0.5, "grad_norm": 1.2808690716253337, "learning_rate": 0.0005186580421792315, "loss": 3.9504, "step": 4399 }, { "epoch": 0.5, "grad_norm": 3.3701635877610516, "learning_rate": 0.0005184724749567209, "loss": 3.932, "step": 4400 }, { "epoch": 0.5, "grad_norm": 0.8255134122372276, "learning_rate": 0.0005182869051862844, "loss": 3.8076, "step": 4401 }, { "epoch": 0.5, "grad_norm": 0.9453635244419403, "learning_rate": 0.0005181013328935181, "loss": 3.8525, "step": 4402 }, { "epoch": 0.5, "grad_norm": 0.8545207309657498, "learning_rate": 0.0005179157581040178, "loss": 3.9571, "step": 4403 }, { "epoch": 0.5, "grad_norm": 0.8670517059933748, "learning_rate": 0.0005177301808433802, "loss": 3.7181, "step": 4404 }, { "epoch": 0.51, "grad_norm": 0.6963259831461558, "learning_rate": 0.0005175446011372022, "loss": 3.9166, "step": 4405 }, { "epoch": 0.51, "grad_norm": 1.5041071762931801, "learning_rate": 0.0005173590190110808, "loss": 3.8618, "step": 4406 }, { "epoch": 0.51, "grad_norm": 2.093959416955285, "learning_rate": 0.0005171734344906136, "loss": 3.7172, "step": 4407 }, { "epoch": 0.51, "grad_norm": 0.6848089263403223, "learning_rate": 0.0005169878476013986, "loss": 3.8264, "step": 4408 }, { "epoch": 0.51, "grad_norm": 0.9554274853374514, "learning_rate": 0.0005168022583690339, "loss": 3.7125, "step": 4409 }, { "epoch": 0.51, "grad_norm": 0.7872795307688186, "learning_rate": 0.0005166166668191176, "loss": 4.0025, "step": 4410 }, { "epoch": 0.51, "grad_norm": 0.8734150245931039, "learning_rate": 0.0005164310729772492, "loss": 4.0752, "step": 4411 }, { "epoch": 0.51, "grad_norm": 0.6997436785969612, "learning_rate": 0.0005162454768690274, "loss": 3.9366, "step": 4412 }, { "epoch": 0.51, "grad_norm": 0.7078611256730544, "learning_rate": 0.0005160598785200515, "loss": 3.7788, "step": 4413 }, { "epoch": 0.51, "grad_norm": 0.7281303409151926, "learning_rate": 0.0005158742779559217, "loss": 3.9342, "step": 4414 }, { "epoch": 0.51, "grad_norm": 0.6291251776351717, "learning_rate": 0.0005156886752022379, "loss": 3.7555, "step": 4415 }, { "epoch": 0.51, "grad_norm": 0.7449039865740789, "learning_rate": 0.0005155030702846002, "loss": 3.869, "step": 4416 }, { "epoch": 0.51, "grad_norm": 0.6424272326682253, "learning_rate": 0.0005153174632286097, "loss": 3.8629, "step": 4417 }, { "epoch": 0.51, "grad_norm": 0.6627893803731536, "learning_rate": 0.000515131854059867, "loss": 3.7169, "step": 4418 }, { "epoch": 0.51, "grad_norm": 0.6324464890217042, "learning_rate": 0.0005149462428039734, "loss": 3.8907, "step": 4419 }, { "epoch": 0.51, "grad_norm": 1.3359302938983024, "learning_rate": 0.0005147606294865307, "loss": 3.7251, "step": 4420 }, { "epoch": 0.51, "grad_norm": 0.6351025800189747, "learning_rate": 0.0005145750141331405, "loss": 3.8049, "step": 4421 }, { "epoch": 0.51, "grad_norm": 0.7154992409020873, "learning_rate": 0.0005143893967694047, "loss": 3.6793, "step": 4422 }, { "epoch": 0.51, "grad_norm": 0.9175846169329397, "learning_rate": 0.0005142037774209262, "loss": 
3.4372, "step": 4423 }, { "epoch": 0.51, "grad_norm": 0.7037982628831686, "learning_rate": 0.0005140181561133072, "loss": 4.0483, "step": 4424 }, { "epoch": 0.51, "grad_norm": 0.6732485928179146, "learning_rate": 0.0005138325328721507, "loss": 3.8031, "step": 4425 }, { "epoch": 0.51, "grad_norm": 1.9729577124680968, "learning_rate": 0.00051364690772306, "loss": 3.8098, "step": 4426 }, { "epoch": 0.51, "grad_norm": 0.6828161538434582, "learning_rate": 0.0005134612806916387, "loss": 3.8509, "step": 4427 }, { "epoch": 0.51, "grad_norm": 0.5720333492684491, "learning_rate": 0.0005132756518034901, "loss": 3.9235, "step": 4428 }, { "epoch": 0.51, "grad_norm": 0.7769437903719367, "learning_rate": 0.0005130900210842185, "loss": 4.141, "step": 4429 }, { "epoch": 0.51, "grad_norm": 0.8204509395655092, "learning_rate": 0.000512904388559428, "loss": 3.9, "step": 4430 }, { "epoch": 0.51, "grad_norm": 0.6304546245555973, "learning_rate": 0.000512718754254723, "loss": 3.7579, "step": 4431 }, { "epoch": 0.51, "grad_norm": 0.6988784783887135, "learning_rate": 0.0005125331181957083, "loss": 3.9411, "step": 4432 }, { "epoch": 0.51, "grad_norm": 2.1980708050073754, "learning_rate": 0.000512347480407989, "loss": 4.0388, "step": 4433 }, { "epoch": 0.51, "grad_norm": 0.7607761869368107, "learning_rate": 0.00051216184091717, "loss": 4.0144, "step": 4434 }, { "epoch": 0.51, "grad_norm": 0.6746803088935472, "learning_rate": 0.0005119761997488569, "loss": 3.8825, "step": 4435 }, { "epoch": 0.51, "grad_norm": 0.6293219231350242, "learning_rate": 0.0005117905569286552, "loss": 3.8482, "step": 4436 }, { "epoch": 0.51, "grad_norm": 0.7421983562012114, "learning_rate": 0.0005116049124821713, "loss": 3.7897, "step": 4437 }, { "epoch": 0.51, "grad_norm": 1.6388084202405335, "learning_rate": 0.0005114192664350107, "loss": 3.8245, "step": 4438 }, { "epoch": 0.51, "grad_norm": 0.638198014398492, "learning_rate": 0.0005112336188127798, "loss": 3.7014, "step": 4439 }, { "epoch": 0.51, "grad_norm": 1.0095036370497699, "learning_rate": 0.0005110479696410857, "loss": 3.7497, "step": 4440 }, { "epoch": 0.51, "grad_norm": 0.7013502365461828, "learning_rate": 0.0005108623189455343, "loss": 3.9945, "step": 4441 }, { "epoch": 0.51, "grad_norm": 0.7025627557210031, "learning_rate": 0.0005106766667517335, "loss": 4.1263, "step": 4442 }, { "epoch": 0.51, "grad_norm": 0.6889109435701478, "learning_rate": 0.0005104910130852899, "loss": 3.932, "step": 4443 }, { "epoch": 0.51, "grad_norm": 0.6961392097799216, "learning_rate": 0.0005103053579718109, "loss": 4.0102, "step": 4444 }, { "epoch": 0.51, "grad_norm": 0.613249233520474, "learning_rate": 0.0005101197014369043, "loss": 3.6956, "step": 4445 }, { "epoch": 0.51, "grad_norm": 0.6976960715506029, "learning_rate": 0.0005099340435061778, "loss": 3.9479, "step": 4446 }, { "epoch": 0.51, "grad_norm": 0.7530633624192621, "learning_rate": 0.0005097483842052393, "loss": 3.7851, "step": 4447 }, { "epoch": 0.51, "grad_norm": 1.0323790048566046, "learning_rate": 0.000509562723559697, "loss": 4.0747, "step": 4448 }, { "epoch": 0.51, "grad_norm": 0.6878238508408644, "learning_rate": 0.0005093770615951594, "loss": 4.012, "step": 4449 }, { "epoch": 0.51, "grad_norm": 0.7019604580454764, "learning_rate": 0.0005091913983372347, "loss": 3.6853, "step": 4450 }, { "epoch": 0.51, "grad_norm": 0.7849089157135006, "learning_rate": 0.0005090057338115319, "loss": 3.9267, "step": 4451 }, { "epoch": 0.51, "grad_norm": 0.7372080177256743, "learning_rate": 0.0005088200680436599, "loss": 3.7817, "step": 4452 }, { 
"epoch": 0.51, "grad_norm": 0.7309251864880948, "learning_rate": 0.0005086344010592275, "loss": 3.9065, "step": 4453 }, { "epoch": 0.51, "grad_norm": 0.9809484024784882, "learning_rate": 0.0005084487328838441, "loss": 3.9401, "step": 4454 }, { "epoch": 0.51, "grad_norm": 1.3354661471387699, "learning_rate": 0.0005082630635431191, "loss": 3.7396, "step": 4455 }, { "epoch": 0.51, "grad_norm": 0.694307745159674, "learning_rate": 0.0005080773930626619, "loss": 3.9724, "step": 4456 }, { "epoch": 0.51, "grad_norm": 0.8779913059713916, "learning_rate": 0.0005078917214680822, "loss": 3.9794, "step": 4457 }, { "epoch": 0.51, "grad_norm": 0.7042090268198159, "learning_rate": 0.0005077060487849903, "loss": 3.719, "step": 4458 }, { "epoch": 0.51, "grad_norm": 0.6457111016124838, "learning_rate": 0.0005075203750389956, "loss": 3.9746, "step": 4459 }, { "epoch": 0.51, "grad_norm": 0.7588704947154931, "learning_rate": 0.0005073347002557085, "loss": 4.0939, "step": 4460 }, { "epoch": 0.51, "grad_norm": 0.6905065795609185, "learning_rate": 0.0005071490244607395, "loss": 3.9236, "step": 4461 }, { "epoch": 0.51, "grad_norm": 0.6317106605632934, "learning_rate": 0.0005069633476796986, "loss": 3.8212, "step": 4462 }, { "epoch": 0.51, "grad_norm": 0.6697734340460312, "learning_rate": 0.0005067776699381969, "loss": 3.8996, "step": 4463 }, { "epoch": 0.51, "grad_norm": 0.637531486320941, "learning_rate": 0.0005065919912618446, "loss": 3.9474, "step": 4464 }, { "epoch": 0.51, "grad_norm": 0.7427565911840636, "learning_rate": 0.0005064063116762529, "loss": 4.1795, "step": 4465 }, { "epoch": 0.51, "grad_norm": 0.6996792686034751, "learning_rate": 0.0005062206312070323, "loss": 3.8792, "step": 4466 }, { "epoch": 0.51, "grad_norm": 1.0006494524879523, "learning_rate": 0.0005060349498797945, "loss": 3.6578, "step": 4467 }, { "epoch": 0.51, "grad_norm": 0.6063535674917401, "learning_rate": 0.0005058492677201505, "loss": 3.8894, "step": 4468 }, { "epoch": 0.51, "grad_norm": 0.7163827153165885, "learning_rate": 0.0005056635847537112, "loss": 4.0838, "step": 4469 }, { "epoch": 0.51, "grad_norm": 0.7329798048625743, "learning_rate": 0.0005054779010060886, "loss": 3.7144, "step": 4470 }, { "epoch": 0.51, "grad_norm": 0.6849190904583171, "learning_rate": 0.0005052922165028939, "loss": 4.1429, "step": 4471 }, { "epoch": 0.51, "grad_norm": 0.6076086702590433, "learning_rate": 0.0005051065312697387, "loss": 3.8245, "step": 4472 }, { "epoch": 0.51, "grad_norm": 1.1766681679500233, "learning_rate": 0.0005049208453322352, "loss": 3.7827, "step": 4473 }, { "epoch": 0.51, "grad_norm": 0.699577323977227, "learning_rate": 0.0005047351587159945, "loss": 3.9705, "step": 4474 }, { "epoch": 0.51, "grad_norm": 0.7459222246621523, "learning_rate": 0.000504549471446629, "loss": 3.7213, "step": 4475 }, { "epoch": 0.51, "grad_norm": 0.7798314265036961, "learning_rate": 0.0005043637835497507, "loss": 4.0479, "step": 4476 }, { "epoch": 0.51, "grad_norm": 0.6177494092175755, "learning_rate": 0.0005041780950509716, "loss": 3.8241, "step": 4477 }, { "epoch": 0.51, "grad_norm": 0.6860317363251536, "learning_rate": 0.000503992405975904, "loss": 4.017, "step": 4478 }, { "epoch": 0.51, "grad_norm": 0.624501089764449, "learning_rate": 0.00050380671635016, "loss": 4.1095, "step": 4479 }, { "epoch": 0.51, "grad_norm": 1.4517372383607579, "learning_rate": 0.0005036210261993523, "loss": 3.8872, "step": 4480 }, { "epoch": 0.51, "grad_norm": 0.6611967312543501, "learning_rate": 0.0005034353355490927, "loss": 3.8398, "step": 4481 }, { "epoch": 0.51, 
"grad_norm": 0.8456387237916634, "learning_rate": 0.0005032496444249943, "loss": 3.7342, "step": 4482 }, { "epoch": 0.51, "grad_norm": 0.7233887345500365, "learning_rate": 0.0005030639528526693, "loss": 3.8875, "step": 4483 }, { "epoch": 0.51, "grad_norm": 0.6341588197288387, "learning_rate": 0.0005028782608577304, "loss": 3.7504, "step": 4484 }, { "epoch": 0.51, "grad_norm": 0.6879094488234176, "learning_rate": 0.0005026925684657902, "loss": 3.917, "step": 4485 }, { "epoch": 0.51, "grad_norm": 0.8567504162349512, "learning_rate": 0.0005025068757024616, "loss": 4.0542, "step": 4486 }, { "epoch": 0.51, "grad_norm": 0.6559315354399435, "learning_rate": 0.0005023211825933572, "loss": 4.0226, "step": 4487 }, { "epoch": 0.51, "grad_norm": 0.7650561971392466, "learning_rate": 0.00050213548916409, "loss": 3.8723, "step": 4488 }, { "epoch": 0.51, "grad_norm": 1.4636090997506657, "learning_rate": 0.0005019497954402728, "loss": 3.8555, "step": 4489 }, { "epoch": 0.51, "grad_norm": 0.8846292832928477, "learning_rate": 0.0005017641014475184, "loss": 3.8701, "step": 4490 }, { "epoch": 0.51, "grad_norm": 0.8087866730305062, "learning_rate": 0.0005015784072114397, "loss": 3.957, "step": 4491 }, { "epoch": 0.52, "grad_norm": 1.5441202649553, "learning_rate": 0.0005013927127576501, "loss": 3.7416, "step": 4492 }, { "epoch": 0.52, "grad_norm": 0.6006882086342504, "learning_rate": 0.0005012070181117621, "loss": 3.8194, "step": 4493 }, { "epoch": 0.52, "grad_norm": 0.7107663883743528, "learning_rate": 0.0005010213232993891, "loss": 3.8645, "step": 4494 }, { "epoch": 0.52, "grad_norm": 0.6706493472737447, "learning_rate": 0.000500835628346144, "loss": 4.033, "step": 4495 }, { "epoch": 0.52, "grad_norm": 0.7376474088565605, "learning_rate": 0.00050064993327764, "loss": 3.9634, "step": 4496 }, { "epoch": 0.52, "grad_norm": 0.6738659279984577, "learning_rate": 0.0005004642381194899, "loss": 3.7924, "step": 4497 }, { "epoch": 0.52, "grad_norm": 1.15752110809593, "learning_rate": 0.0005002785428973071, "loss": 3.7193, "step": 4498 }, { "epoch": 0.52, "grad_norm": 0.629189611895454, "learning_rate": 0.0005000928476367046, "loss": 3.9295, "step": 4499 }, { "epoch": 0.52, "grad_norm": 0.6564001834287247, "learning_rate": 0.0004999071523632954, "loss": 3.9296, "step": 4500 }, { "epoch": 0.52, "grad_norm": 0.6677302443223629, "learning_rate": 0.000499721457102693, "loss": 3.901, "step": 4501 }, { "epoch": 0.52, "grad_norm": 0.8408351520323896, "learning_rate": 0.0004995357618805102, "loss": 3.8231, "step": 4502 }, { "epoch": 0.52, "grad_norm": 0.8976081182120693, "learning_rate": 0.0004993500667223601, "loss": 3.9373, "step": 4503 }, { "epoch": 0.52, "grad_norm": 0.760622969554225, "learning_rate": 0.0004991643716538561, "loss": 3.7838, "step": 4504 }, { "epoch": 0.52, "grad_norm": 3.0009510815773197, "learning_rate": 0.0004989786767006109, "loss": 3.7823, "step": 4505 }, { "epoch": 0.52, "grad_norm": 0.7782442274632977, "learning_rate": 0.0004987929818882379, "loss": 3.6754, "step": 4506 }, { "epoch": 0.52, "grad_norm": 0.6625267998709758, "learning_rate": 0.0004986072872423499, "loss": 3.8287, "step": 4507 }, { "epoch": 0.52, "grad_norm": 1.31235704461662, "learning_rate": 0.0004984215927885603, "loss": 3.9413, "step": 4508 }, { "epoch": 0.52, "grad_norm": 0.646839094222067, "learning_rate": 0.0004982358985524817, "loss": 3.6649, "step": 4509 }, { "epoch": 0.52, "grad_norm": 0.8002471407164435, "learning_rate": 0.0004980502045597272, "loss": 4.1684, "step": 4510 }, { "epoch": 0.52, "grad_norm": 1.0076329120766796, 
"learning_rate": 0.00049786451083591, "loss": 3.9883, "step": 4511 }, { "epoch": 0.52, "grad_norm": 0.825441966763688, "learning_rate": 0.0004976788174066428, "loss": 3.6736, "step": 4512 }, { "epoch": 0.52, "grad_norm": 0.6959683334293064, "learning_rate": 0.0004974931242975385, "loss": 3.8048, "step": 4513 }, { "epoch": 0.52, "grad_norm": 0.7603304607010152, "learning_rate": 0.0004973074315342101, "loss": 3.8946, "step": 4514 }, { "epoch": 0.52, "grad_norm": 0.950674771687789, "learning_rate": 0.0004971217391422697, "loss": 4.154, "step": 4515 }, { "epoch": 0.52, "grad_norm": 0.8940376437531288, "learning_rate": 0.0004969360471473309, "loss": 3.8315, "step": 4516 }, { "epoch": 0.52, "grad_norm": 1.6778294103874054, "learning_rate": 0.0004967503555750059, "loss": 3.9383, "step": 4517 }, { "epoch": 0.52, "grad_norm": 0.7174912414374011, "learning_rate": 0.0004965646644509074, "loss": 3.985, "step": 4518 }, { "epoch": 0.52, "grad_norm": 0.7693735892009127, "learning_rate": 0.000496378973800648, "loss": 3.9009, "step": 4519 }, { "epoch": 0.52, "grad_norm": 0.720233177983674, "learning_rate": 0.00049619328364984, "loss": 4.0251, "step": 4520 }, { "epoch": 0.52, "grad_norm": 0.7926915220264754, "learning_rate": 0.0004960075940240961, "loss": 3.8099, "step": 4521 }, { "epoch": 0.52, "grad_norm": 1.110531885458151, "learning_rate": 0.0004958219049490284, "loss": 3.7312, "step": 4522 }, { "epoch": 0.52, "grad_norm": 0.7621177255246574, "learning_rate": 0.0004956362164502495, "loss": 3.8471, "step": 4523 }, { "epoch": 0.52, "grad_norm": 0.6973918703499692, "learning_rate": 0.0004954505285533711, "loss": 3.6011, "step": 4524 }, { "epoch": 0.52, "grad_norm": 0.6672577137147394, "learning_rate": 0.0004952648412840056, "loss": 3.5946, "step": 4525 }, { "epoch": 0.52, "grad_norm": 0.6428044681155124, "learning_rate": 0.000495079154667765, "loss": 3.8969, "step": 4526 }, { "epoch": 0.52, "grad_norm": 0.6994904064491224, "learning_rate": 0.0004948934687302614, "loss": 3.8602, "step": 4527 }, { "epoch": 0.52, "grad_norm": 0.928479661623397, "learning_rate": 0.0004947077834971061, "loss": 3.9046, "step": 4528 }, { "epoch": 0.52, "grad_norm": 0.8767120039674479, "learning_rate": 0.0004945220989939115, "loss": 3.9371, "step": 4529 }, { "epoch": 0.52, "grad_norm": 0.7523592467522092, "learning_rate": 0.0004943364152462887, "loss": 3.7054, "step": 4530 }, { "epoch": 0.52, "grad_norm": 0.6131060211340931, "learning_rate": 0.0004941507322798496, "loss": 3.6901, "step": 4531 }, { "epoch": 0.52, "grad_norm": 0.7298569721087329, "learning_rate": 0.0004939650501202055, "loss": 3.8738, "step": 4532 }, { "epoch": 0.52, "grad_norm": 0.6935131722818929, "learning_rate": 0.0004937793687929677, "loss": 3.9521, "step": 4533 }, { "epoch": 0.52, "grad_norm": 0.7046703474833415, "learning_rate": 0.0004935936883237474, "loss": 3.6911, "step": 4534 }, { "epoch": 0.52, "grad_norm": 0.768082730504586, "learning_rate": 0.0004934080087381555, "loss": 3.7459, "step": 4535 }, { "epoch": 0.52, "grad_norm": 0.377706420266484, "learning_rate": 0.0004932223300618033, "loss": 4.0084, "step": 4536 }, { "epoch": 0.52, "grad_norm": 0.6225590380967867, "learning_rate": 0.0004930366523203014, "loss": 3.6567, "step": 4537 }, { "epoch": 0.52, "grad_norm": 0.6782092321535321, "learning_rate": 0.0004928509755392607, "loss": 3.8593, "step": 4538 }, { "epoch": 0.52, "grad_norm": 0.6687967706943265, "learning_rate": 0.0004926652997442917, "loss": 3.8292, "step": 4539 }, { "epoch": 0.52, "grad_norm": 0.6785548858250524, "learning_rate": 
0.0004924796249610045, "loss": 3.7158, "step": 4540 }, { "epoch": 0.52, "grad_norm": 0.7146326596544677, "learning_rate": 0.0004922939512150098, "loss": 3.9781, "step": 4541 }, { "epoch": 0.52, "grad_norm": 0.6566211406081467, "learning_rate": 0.0004921082785319178, "loss": 3.6622, "step": 4542 }, { "epoch": 0.52, "grad_norm": 0.7089065653540122, "learning_rate": 0.0004919226069373382, "loss": 3.8517, "step": 4543 }, { "epoch": 0.52, "grad_norm": 0.812854148740293, "learning_rate": 0.0004917369364568811, "loss": 3.6759, "step": 4544 }, { "epoch": 0.52, "grad_norm": 0.5926669428829177, "learning_rate": 0.0004915512671161559, "loss": 3.6179, "step": 4545 }, { "epoch": 0.52, "grad_norm": 0.6667947858261447, "learning_rate": 0.0004913655989407726, "loss": 3.7115, "step": 4546 }, { "epoch": 0.52, "grad_norm": 0.6977969673123785, "learning_rate": 0.0004911799319563402, "loss": 3.9032, "step": 4547 }, { "epoch": 0.52, "grad_norm": 0.5786531510050691, "learning_rate": 0.0004909942661884681, "loss": 3.9285, "step": 4548 }, { "epoch": 0.52, "grad_norm": 0.7427473586253307, "learning_rate": 0.0004908086016627653, "loss": 3.707, "step": 4549 }, { "epoch": 0.52, "grad_norm": 0.6726075890976019, "learning_rate": 0.0004906229384048407, "loss": 3.9704, "step": 4550 }, { "epoch": 0.52, "grad_norm": 1.268577490039533, "learning_rate": 0.000490437276440303, "loss": 3.775, "step": 4551 }, { "epoch": 0.52, "grad_norm": 0.6081393784530363, "learning_rate": 0.0004902516157947608, "loss": 3.8651, "step": 4552 }, { "epoch": 0.52, "grad_norm": 0.6655913911871548, "learning_rate": 0.0004900659564938223, "loss": 3.941, "step": 4553 }, { "epoch": 0.52, "grad_norm": 0.7764596493381106, "learning_rate": 0.000489880298563096, "loss": 3.9088, "step": 4554 }, { "epoch": 0.52, "grad_norm": 0.8719955393109148, "learning_rate": 0.0004896946420281891, "loss": 3.7988, "step": 4555 }, { "epoch": 0.52, "grad_norm": 0.6446123033607158, "learning_rate": 0.0004895089869147102, "loss": 3.7559, "step": 4556 }, { "epoch": 0.52, "grad_norm": 0.5839973705443173, "learning_rate": 0.0004893233332482666, "loss": 3.7153, "step": 4557 }, { "epoch": 0.52, "grad_norm": 0.6551712868693319, "learning_rate": 0.0004891376810544657, "loss": 3.6985, "step": 4558 }, { "epoch": 0.52, "grad_norm": 0.7314142363728047, "learning_rate": 0.0004889520303589146, "loss": 4.1195, "step": 4559 }, { "epoch": 0.52, "grad_norm": 1.3735474564126573, "learning_rate": 0.0004887663811872201, "loss": 3.8307, "step": 4560 }, { "epoch": 0.52, "grad_norm": 0.6427744869630965, "learning_rate": 0.0004885807335649894, "loss": 3.8593, "step": 4561 }, { "epoch": 0.52, "grad_norm": 0.6860185354776089, "learning_rate": 0.0004883950875178288, "loss": 3.7505, "step": 4562 }, { "epoch": 0.52, "grad_norm": 0.6415726059580675, "learning_rate": 0.0004882094430713447, "loss": 4.0498, "step": 4563 }, { "epoch": 0.52, "grad_norm": 0.9287397251510705, "learning_rate": 0.00048802380025114326, "loss": 4.0113, "step": 4564 }, { "epoch": 0.52, "grad_norm": 0.6790304687568748, "learning_rate": 0.00048783815908283, "loss": 4.0323, "step": 4565 }, { "epoch": 0.52, "grad_norm": 0.6257453542490727, "learning_rate": 0.00048765251959201106, "loss": 3.732, "step": 4566 }, { "epoch": 0.52, "grad_norm": 0.6259256446401436, "learning_rate": 0.00048746688180429173, "loss": 3.7672, "step": 4567 }, { "epoch": 0.52, "grad_norm": 0.64174211401797, "learning_rate": 0.00048728124574527705, "loss": 3.84, "step": 4568 }, { "epoch": 0.52, "grad_norm": 0.7266564168950975, "learning_rate": 0.00048709561144057216, 
"loss": 3.7238, "step": 4569 }, { "epoch": 0.52, "grad_norm": 0.7173094992755242, "learning_rate": 0.00048690997891578155, "loss": 4.0728, "step": 4570 }, { "epoch": 0.52, "grad_norm": 1.1578704603692354, "learning_rate": 0.00048672434819651, "loss": 3.8308, "step": 4571 }, { "epoch": 0.52, "grad_norm": 0.7230746951027772, "learning_rate": 0.0004865387193083615, "loss": 3.8675, "step": 4572 }, { "epoch": 0.52, "grad_norm": 0.6346827616994839, "learning_rate": 0.0004863530922769401, "loss": 3.9345, "step": 4573 }, { "epoch": 0.52, "grad_norm": 0.6803003292129063, "learning_rate": 0.0004861674671278494, "loss": 3.9632, "step": 4574 }, { "epoch": 0.52, "grad_norm": 0.6392274092927573, "learning_rate": 0.0004859818438866928, "loss": 3.714, "step": 4575 }, { "epoch": 0.52, "grad_norm": 0.6620672291359806, "learning_rate": 0.0004857962225790739, "loss": 3.7282, "step": 4576 }, { "epoch": 0.52, "grad_norm": 0.7664628742256426, "learning_rate": 0.00048561060323059534, "loss": 3.7966, "step": 4577 }, { "epoch": 0.52, "grad_norm": 1.0573554733434487, "learning_rate": 0.0004854249858668597, "loss": 3.8369, "step": 4578 }, { "epoch": 0.53, "grad_norm": 0.6743405420709296, "learning_rate": 0.0004852393705134695, "loss": 3.8725, "step": 4579 }, { "epoch": 0.53, "grad_norm": 0.6654323262697931, "learning_rate": 0.0004850537571960266, "loss": 3.7291, "step": 4580 }, { "epoch": 0.53, "grad_norm": 0.7253980992443212, "learning_rate": 0.00048486814594013303, "loss": 3.8435, "step": 4581 }, { "epoch": 0.53, "grad_norm": 0.6289294774340574, "learning_rate": 0.0004846825367713904, "loss": 3.6516, "step": 4582 }, { "epoch": 0.53, "grad_norm": 0.5854216102343045, "learning_rate": 0.0004844969297153999, "loss": 3.6293, "step": 4583 }, { "epoch": 0.53, "grad_norm": 0.7019691687477152, "learning_rate": 0.00048431132479776227, "loss": 3.893, "step": 4584 }, { "epoch": 0.53, "grad_norm": 1.147640198516509, "learning_rate": 0.00048412572204407825, "loss": 3.8207, "step": 4585 }, { "epoch": 0.53, "grad_norm": 0.6898726171242755, "learning_rate": 0.00048394012147994853, "loss": 3.8328, "step": 4586 }, { "epoch": 0.53, "grad_norm": 0.6097244813524376, "learning_rate": 0.0004837545231309728, "loss": 3.6195, "step": 4587 }, { "epoch": 0.53, "grad_norm": 0.6355915694446208, "learning_rate": 0.000483568927022751, "loss": 3.7989, "step": 4588 }, { "epoch": 0.53, "grad_norm": 0.7327532950204162, "learning_rate": 0.00048338333318088256, "loss": 3.9135, "step": 4589 }, { "epoch": 0.53, "grad_norm": 0.6406182020332919, "learning_rate": 0.0004831977416309663, "loss": 3.9765, "step": 4590 }, { "epoch": 0.53, "grad_norm": 0.6321033290526714, "learning_rate": 0.00048301215239860145, "loss": 3.8803, "step": 4591 }, { "epoch": 0.53, "grad_norm": 0.861160353663047, "learning_rate": 0.0004828265655093865, "loss": 3.9288, "step": 4592 }, { "epoch": 0.53, "grad_norm": 0.7613978868208351, "learning_rate": 0.0004826409809889193, "loss": 3.8669, "step": 4593 }, { "epoch": 0.53, "grad_norm": 0.9930012501274026, "learning_rate": 0.000482455398862798, "loss": 3.7654, "step": 4594 }, { "epoch": 0.53, "grad_norm": 0.635871238966408, "learning_rate": 0.00048226981915661983, "loss": 3.923, "step": 4595 }, { "epoch": 0.53, "grad_norm": 0.7637225451985097, "learning_rate": 0.00048208424189598233, "loss": 3.8052, "step": 4596 }, { "epoch": 0.53, "grad_norm": 1.2564799203131192, "learning_rate": 0.000481898667106482, "loss": 4.1383, "step": 4597 }, { "epoch": 0.53, "grad_norm": 0.707148184379511, "learning_rate": 0.0004817130948137157, "loss": 3.8698, 
"step": 4598 }, { "epoch": 0.53, "grad_norm": 1.7729479523436684, "learning_rate": 0.0004815275250432792, "loss": 3.8985, "step": 4599 }, { "epoch": 0.53, "grad_norm": 0.7304895395431245, "learning_rate": 0.0004813419578207684, "loss": 3.9457, "step": 4600 }, { "epoch": 0.53, "grad_norm": 0.923994287820989, "learning_rate": 0.0004811563931717791, "loss": 4.179, "step": 4601 }, { "epoch": 0.53, "grad_norm": 0.6839847584722576, "learning_rate": 0.0004809708311219062, "loss": 4.1005, "step": 4602 }, { "epoch": 0.53, "grad_norm": 0.7556248185689219, "learning_rate": 0.00048078527169674427, "loss": 3.5449, "step": 4603 }, { "epoch": 0.53, "grad_norm": 0.8089215691717822, "learning_rate": 0.000480599714921888, "loss": 3.5299, "step": 4604 }, { "epoch": 0.53, "grad_norm": 0.8807839588909995, "learning_rate": 0.000480414160822931, "loss": 3.9293, "step": 4605 }, { "epoch": 0.53, "grad_norm": 1.076020455050776, "learning_rate": 0.0004802286094254673, "loss": 3.6507, "step": 4606 }, { "epoch": 0.53, "grad_norm": 1.094588858075301, "learning_rate": 0.0004800430607550901, "loss": 3.9444, "step": 4607 }, { "epoch": 0.53, "grad_norm": 0.6396116146850653, "learning_rate": 0.0004798575148373923, "loss": 3.9609, "step": 4608 }, { "epoch": 0.53, "grad_norm": 0.6327756687674794, "learning_rate": 0.0004796719716979663, "loss": 3.9381, "step": 4609 }, { "epoch": 0.53, "grad_norm": 0.6435227032016294, "learning_rate": 0.00047948643136240423, "loss": 3.8592, "step": 4610 }, { "epoch": 0.53, "grad_norm": 0.7332164307089346, "learning_rate": 0.00047930089385629806, "loss": 3.9285, "step": 4611 }, { "epoch": 0.53, "grad_norm": 0.6690511734620389, "learning_rate": 0.00047911535920523897, "loss": 3.6763, "step": 4612 }, { "epoch": 0.53, "grad_norm": 0.6225103022100422, "learning_rate": 0.00047892982743481805, "loss": 3.7435, "step": 4613 }, { "epoch": 0.53, "grad_norm": 0.6015745341922872, "learning_rate": 0.0004787442985706259, "loss": 3.7547, "step": 4614 }, { "epoch": 0.53, "grad_norm": 0.773509135219887, "learning_rate": 0.00047855877263825223, "loss": 3.8501, "step": 4615 }, { "epoch": 0.53, "grad_norm": 0.7209885331977977, "learning_rate": 0.0004783732496632873, "loss": 3.8679, "step": 4616 }, { "epoch": 0.53, "grad_norm": 0.617846080716903, "learning_rate": 0.0004781877296713205, "loss": 3.7009, "step": 4617 }, { "epoch": 0.53, "grad_norm": 0.6450979478678796, "learning_rate": 0.00047800221268794055, "loss": 3.5454, "step": 4618 }, { "epoch": 0.53, "grad_norm": 0.766838489698297, "learning_rate": 0.0004778166987387361, "loss": 3.7126, "step": 4619 }, { "epoch": 0.53, "grad_norm": 0.6694456686575371, "learning_rate": 0.00047763118784929494, "loss": 3.7321, "step": 4620 }, { "epoch": 0.53, "grad_norm": 0.657225800303527, "learning_rate": 0.00047744568004520527, "loss": 3.6254, "step": 4621 }, { "epoch": 0.53, "grad_norm": 0.7378215256197929, "learning_rate": 0.000477260175352054, "loss": 3.6661, "step": 4622 }, { "epoch": 0.53, "grad_norm": 0.7061433323932471, "learning_rate": 0.0004770746737954282, "loss": 3.7522, "step": 4623 }, { "epoch": 0.53, "grad_norm": 0.8107853097652231, "learning_rate": 0.0004768891754009141, "loss": 3.9038, "step": 4624 }, { "epoch": 0.53, "grad_norm": 0.9452609196534427, "learning_rate": 0.00047670368019409753, "loss": 3.8686, "step": 4625 }, { "epoch": 0.53, "grad_norm": 0.7231148944255227, "learning_rate": 0.00047651818820056445, "loss": 3.7554, "step": 4626 }, { "epoch": 0.53, "grad_norm": 0.7449675578068661, "learning_rate": 0.00047633269944589974, "loss": 4.0211, "step": 4627 }, 
{ "epoch": 0.53, "grad_norm": 0.6605313943168138, "learning_rate": 0.00047614721395568786, "loss": 4.0067, "step": 4628 }, { "epoch": 0.53, "grad_norm": 0.6440987874230606, "learning_rate": 0.0004759617317555133, "loss": 3.7412, "step": 4629 }, { "epoch": 0.53, "grad_norm": 0.6361627081686643, "learning_rate": 0.0004757762528709594, "loss": 3.848, "step": 4630 } ], "logging_steps": 1.0, "max_steps": 8721, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "total_flos": 15142711762944.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }