{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.457236842105264, "eval_steps": 500, "global_step": 23000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004111842105263158, "grad_norm": 82.21414947509766, "learning_rate": 9.997532894736843e-06, "loss": 1.0101, "step": 10 }, { "epoch": 0.008223684210526315, "grad_norm": 79.48388671875, "learning_rate": 9.993832236842106e-06, "loss": 1.3237, "step": 20 }, { "epoch": 0.012335526315789474, "grad_norm": 33.75522232055664, "learning_rate": 9.989720394736842e-06, "loss": 0.596, "step": 30 }, { "epoch": 0.01644736842105263, "grad_norm": 0.831979513168335, "learning_rate": 9.98560855263158e-06, "loss": 0.8206, "step": 40 }, { "epoch": 0.02055921052631579, "grad_norm": 58.79596710205078, "learning_rate": 9.981496710526317e-06, "loss": 1.0928, "step": 50 }, { "epoch": 0.024671052631578948, "grad_norm": 51.91939926147461, "learning_rate": 9.977384868421054e-06, "loss": 0.7411, "step": 60 }, { "epoch": 0.028782894736842105, "grad_norm": 69.77727508544922, "learning_rate": 9.97327302631579e-06, "loss": 0.5594, "step": 70 }, { "epoch": 0.03289473684210526, "grad_norm": 69.18291473388672, "learning_rate": 9.969161184210527e-06, "loss": 1.1593, "step": 80 }, { "epoch": 0.03700657894736842, "grad_norm": 25.222484588623047, "learning_rate": 9.965049342105263e-06, "loss": 0.4795, "step": 90 }, { "epoch": 0.04111842105263158, "grad_norm": 65.4477767944336, "learning_rate": 9.9609375e-06, "loss": 0.7011, "step": 100 }, { "epoch": 0.04523026315789474, "grad_norm": 23.820554733276367, "learning_rate": 9.956825657894738e-06, "loss": 0.3759, "step": 110 }, { "epoch": 0.049342105263157895, "grad_norm": 126.01136016845703, "learning_rate": 9.952713815789474e-06, "loss": 1.2958, "step": 120 }, { "epoch": 0.05345394736842105, "grad_norm": 9.699002265930176, "learning_rate": 9.948601973684211e-06, "loss": 0.7123, "step": 130 }, { "epoch": 0.05756578947368421, "grad_norm": 13.736701965332031, "learning_rate": 9.944490131578947e-06, "loss": 1.2954, "step": 140 }, { "epoch": 0.061677631578947366, "grad_norm": 7.952981472015381, "learning_rate": 9.940378289473686e-06, "loss": 0.5001, "step": 150 }, { "epoch": 0.06578947368421052, "grad_norm": 70.3082504272461, "learning_rate": 9.936266447368422e-06, "loss": 0.909, "step": 160 }, { "epoch": 0.06990131578947369, "grad_norm": 51.05005645751953, "learning_rate": 9.93215460526316e-06, "loss": 0.9518, "step": 170 }, { "epoch": 0.07401315789473684, "grad_norm": 41.0015983581543, "learning_rate": 9.928042763157895e-06, "loss": 0.7172, "step": 180 }, { "epoch": 0.078125, "grad_norm": 51.35481262207031, "learning_rate": 9.923930921052633e-06, "loss": 1.106, "step": 190 }, { "epoch": 0.08223684210526316, "grad_norm": 0.127091646194458, "learning_rate": 9.919819078947368e-06, "loss": 0.3531, "step": 200 }, { "epoch": 0.08634868421052631, "grad_norm": 5.155969142913818, "learning_rate": 9.915707236842106e-06, "loss": 0.6066, "step": 210 }, { "epoch": 0.09046052631578948, "grad_norm": 23.830692291259766, "learning_rate": 9.911595394736843e-06, "loss": 0.6448, "step": 220 }, { "epoch": 0.09457236842105263, "grad_norm": 0.5406038761138916, "learning_rate": 9.90748355263158e-06, "loss": 0.7219, "step": 230 }, { "epoch": 0.09868421052631579, "grad_norm": 29.625507354736328, "learning_rate": 9.903371710526317e-06, "loss": 0.522, "step": 240 }, { "epoch": 0.10279605263157894, "grad_norm": 26.483999252319336, "learning_rate": 9.89967105263158e-06, "loss": 0.4394, "step": 250 }, { "epoch": 0.1069078947368421, "grad_norm": 56.8378791809082, "learning_rate": 9.895559210526317e-06, "loss": 0.8402, "step": 260 }, { "epoch": 0.11101973684210527, "grad_norm": 30.779541015625, "learning_rate": 9.891447368421053e-06, "loss": 0.639, "step": 270 }, { "epoch": 0.11513157894736842, "grad_norm": 0.07414458692073822, "learning_rate": 9.88733552631579e-06, "loss": 0.6098, "step": 280 }, { "epoch": 0.11924342105263158, "grad_norm": 0.05630555748939514, "learning_rate": 9.883223684210526e-06, "loss": 0.2677, "step": 290 }, { "epoch": 0.12335526315789473, "grad_norm": 12.017321586608887, "learning_rate": 9.879111842105264e-06, "loss": 0.1545, "step": 300 }, { "epoch": 0.12746710526315788, "grad_norm": 114.26238250732422, "learning_rate": 9.875000000000001e-06, "loss": 0.8309, "step": 310 }, { "epoch": 0.13157894736842105, "grad_norm": 58.30079650878906, "learning_rate": 9.870888157894739e-06, "loss": 0.5668, "step": 320 }, { "epoch": 0.1356907894736842, "grad_norm": 0.042588502168655396, "learning_rate": 9.866776315789474e-06, "loss": 0.6739, "step": 330 }, { "epoch": 0.13980263157894737, "grad_norm": 53.37122344970703, "learning_rate": 9.862664473684212e-06, "loss": 1.2924, "step": 340 }, { "epoch": 0.14391447368421054, "grad_norm": 66.1600341796875, "learning_rate": 9.858552631578948e-06, "loss": 0.9956, "step": 350 }, { "epoch": 0.14802631578947367, "grad_norm": 26.073034286499023, "learning_rate": 9.854440789473685e-06, "loss": 0.1696, "step": 360 }, { "epoch": 0.15213815789473684, "grad_norm": 86.73162078857422, "learning_rate": 9.85032894736842e-06, "loss": 0.7324, "step": 370 }, { "epoch": 0.15625, "grad_norm": 5.856919288635254, "learning_rate": 9.846217105263158e-06, "loss": 0.6496, "step": 380 }, { "epoch": 0.16036184210526316, "grad_norm": 0.010420837439596653, "learning_rate": 9.842105263157896e-06, "loss": 1.0035, "step": 390 }, { "epoch": 0.16447368421052633, "grad_norm": 4.881348609924316, "learning_rate": 9.837993421052633e-06, "loss": 0.1168, "step": 400 }, { "epoch": 0.16858552631578946, "grad_norm": 1.67402184009552, "learning_rate": 9.833881578947369e-06, "loss": 0.7795, "step": 410 }, { "epoch": 0.17269736842105263, "grad_norm": 28.87067222595215, "learning_rate": 9.829769736842106e-06, "loss": 0.661, "step": 420 }, { "epoch": 0.1768092105263158, "grad_norm": 59.759552001953125, "learning_rate": 9.825657894736844e-06, "loss": 0.7666, "step": 430 }, { "epoch": 0.18092105263157895, "grad_norm": 33.859188079833984, "learning_rate": 9.82154605263158e-06, "loss": 0.3961, "step": 440 }, { "epoch": 0.18503289473684212, "grad_norm": 56.692955017089844, "learning_rate": 9.817434210526317e-06, "loss": 0.7028, "step": 450 }, { "epoch": 0.18914473684210525, "grad_norm": 74.9236068725586, "learning_rate": 9.813322368421053e-06, "loss": 0.6201, "step": 460 }, { "epoch": 0.19325657894736842, "grad_norm": 11.421184539794922, "learning_rate": 9.80921052631579e-06, "loss": 0.6496, "step": 470 }, { "epoch": 0.19736842105263158, "grad_norm": 64.66961669921875, "learning_rate": 9.805098684210526e-06, "loss": 0.5065, "step": 480 }, { "epoch": 0.20148026315789475, "grad_norm": 48.3864860534668, "learning_rate": 9.800986842105264e-06, "loss": 0.5425, "step": 490 }, { "epoch": 0.20559210526315788, "grad_norm": 21.042997360229492, "learning_rate": 9.796875000000001e-06, "loss": 0.6503, "step": 500 }, { "epoch": 0.20970394736842105, "grad_norm": 13.386051177978516, "learning_rate": 9.792763157894738e-06, "loss": 0.411, "step": 510 }, { "epoch": 0.2138157894736842, "grad_norm": 0.02692234143614769, "learning_rate": 9.788651315789474e-06, "loss": 0.8217, "step": 520 }, { "epoch": 0.21792763157894737, "grad_norm": 0.34673550724983215, "learning_rate": 9.784539473684212e-06, "loss": 0.6008, "step": 530 }, { "epoch": 0.22203947368421054, "grad_norm": 67.46952056884766, "learning_rate": 9.780427631578947e-06, "loss": 0.7634, "step": 540 }, { "epoch": 0.22615131578947367, "grad_norm": 1.0119798183441162, "learning_rate": 9.776315789473685e-06, "loss": 0.3242, "step": 550 }, { "epoch": 0.23026315789473684, "grad_norm": 31.881614685058594, "learning_rate": 9.772203947368422e-06, "loss": 0.8713, "step": 560 }, { "epoch": 0.234375, "grad_norm": 0.031223086640238762, "learning_rate": 9.768092105263158e-06, "loss": 0.4535, "step": 570 }, { "epoch": 0.23848684210526316, "grad_norm": 3.292686700820923, "learning_rate": 9.763980263157896e-06, "loss": 0.3269, "step": 580 }, { "epoch": 0.24259868421052633, "grad_norm": 44.72856521606445, "learning_rate": 9.759868421052633e-06, "loss": 0.6943, "step": 590 }, { "epoch": 0.24671052631578946, "grad_norm": 2.3683433532714844, "learning_rate": 9.75575657894737e-06, "loss": 0.4183, "step": 600 }, { "epoch": 0.25082236842105265, "grad_norm": 63.95539093017578, "learning_rate": 9.751644736842106e-06, "loss": 0.5693, "step": 610 }, { "epoch": 0.25493421052631576, "grad_norm": 60.73301315307617, "learning_rate": 9.747532894736844e-06, "loss": 1.1358, "step": 620 }, { "epoch": 0.2590460526315789, "grad_norm": 56.232845306396484, "learning_rate": 9.74342105263158e-06, "loss": 0.4148, "step": 630 }, { "epoch": 0.2631578947368421, "grad_norm": 39.33661651611328, "learning_rate": 9.739309210526317e-06, "loss": 0.6761, "step": 640 }, { "epoch": 0.26726973684210525, "grad_norm": 16.530174255371094, "learning_rate": 9.735197368421053e-06, "loss": 1.1621, "step": 650 }, { "epoch": 0.2713815789473684, "grad_norm": 8.869096755981445, "learning_rate": 9.73108552631579e-06, "loss": 0.6395, "step": 660 }, { "epoch": 0.2754934210526316, "grad_norm": 48.607025146484375, "learning_rate": 9.726973684210526e-06, "loss": 0.628, "step": 670 }, { "epoch": 0.27960526315789475, "grad_norm": 2.1920928955078125, "learning_rate": 9.722861842105263e-06, "loss": 0.572, "step": 680 }, { "epoch": 0.2837171052631579, "grad_norm": 1.4220335483551025, "learning_rate": 9.71875e-06, "loss": 0.2606, "step": 690 }, { "epoch": 0.2878289473684211, "grad_norm": 8.437749862670898, "learning_rate": 9.714638157894738e-06, "loss": 0.3371, "step": 700 }, { "epoch": 0.29194078947368424, "grad_norm": 1.182974100112915, "learning_rate": 9.710526315789474e-06, "loss": 0.9404, "step": 710 }, { "epoch": 0.29605263157894735, "grad_norm": 17.592485427856445, "learning_rate": 9.706414473684211e-06, "loss": 0.891, "step": 720 }, { "epoch": 0.3001644736842105, "grad_norm": 0.006371844094246626, "learning_rate": 9.702302631578949e-06, "loss": 0.5336, "step": 730 }, { "epoch": 0.3042763157894737, "grad_norm": 2.269035577774048, "learning_rate": 9.698190789473685e-06, "loss": 0.4996, "step": 740 }, { "epoch": 0.30838815789473684, "grad_norm": 3.4349544048309326, "learning_rate": 9.694078947368422e-06, "loss": 0.7136, "step": 750 }, { "epoch": 0.3125, "grad_norm": 31.934001922607422, "learning_rate": 9.689967105263158e-06, "loss": 0.6703, "step": 760 }, { "epoch": 0.31661184210526316, "grad_norm": 31.40448760986328, "learning_rate": 9.685855263157895e-06, "loss": 0.7493, "step": 770 }, { "epoch": 0.3207236842105263, "grad_norm": 0.03109549917280674, "learning_rate": 9.681743421052633e-06, "loss": 0.6885, "step": 780 }, { "epoch": 0.3248355263157895, "grad_norm": 62.230751037597656, "learning_rate": 9.67763157894737e-06, "loss": 0.4497, "step": 790 }, { "epoch": 0.32894736842105265, "grad_norm": 83.58551025390625, "learning_rate": 9.673519736842106e-06, "loss": 0.5014, "step": 800 }, { "epoch": 0.33305921052631576, "grad_norm": 1.6478629112243652, "learning_rate": 9.669407894736843e-06, "loss": 0.3641, "step": 810 }, { "epoch": 0.3371710526315789, "grad_norm": 0.02929232269525528, "learning_rate": 9.66529605263158e-06, "loss": 0.5607, "step": 820 }, { "epoch": 0.3412828947368421, "grad_norm": 32.008182525634766, "learning_rate": 9.661184210526317e-06, "loss": 0.4397, "step": 830 }, { "epoch": 0.34539473684210525, "grad_norm": 25.76505470275879, "learning_rate": 9.657072368421052e-06, "loss": 0.9786, "step": 840 }, { "epoch": 0.3495065789473684, "grad_norm": 0.04005062207579613, "learning_rate": 9.65296052631579e-06, "loss": 0.284, "step": 850 }, { "epoch": 0.3536184210526316, "grad_norm": 10.596315383911133, "learning_rate": 9.648848684210527e-06, "loss": 0.9688, "step": 860 }, { "epoch": 0.35773026315789475, "grad_norm": 46.72496795654297, "learning_rate": 9.644736842105263e-06, "loss": 0.594, "step": 870 }, { "epoch": 0.3618421052631579, "grad_norm": 43.54978942871094, "learning_rate": 9.640625e-06, "loss": 0.6063, "step": 880 }, { "epoch": 0.3659539473684211, "grad_norm": 2.5766873359680176, "learning_rate": 9.636513157894738e-06, "loss": 0.3333, "step": 890 }, { "epoch": 0.37006578947368424, "grad_norm": 58.31666564941406, "learning_rate": 9.632812500000001e-06, "loss": 0.8034, "step": 900 }, { "epoch": 0.37417763157894735, "grad_norm": 8.071229934692383, "learning_rate": 9.628700657894737e-06, "loss": 0.6695, "step": 910 }, { "epoch": 0.3782894736842105, "grad_norm": 24.1544132232666, "learning_rate": 9.624588815789474e-06, "loss": 0.6279, "step": 920 }, { "epoch": 0.3824013157894737, "grad_norm": 44.78126525878906, "learning_rate": 9.620476973684212e-06, "loss": 0.5958, "step": 930 }, { "epoch": 0.38651315789473684, "grad_norm": 0.43892815709114075, "learning_rate": 9.61636513157895e-06, "loss": 1.0484, "step": 940 }, { "epoch": 0.390625, "grad_norm": 59.37925720214844, "learning_rate": 9.612253289473685e-06, "loss": 0.9802, "step": 950 }, { "epoch": 0.39473684210526316, "grad_norm": 31.142057418823242, "learning_rate": 9.608141447368423e-06, "loss": 0.339, "step": 960 }, { "epoch": 0.3988486842105263, "grad_norm": 40.292205810546875, "learning_rate": 9.604029605263158e-06, "loss": 0.6531, "step": 970 }, { "epoch": 0.4029605263157895, "grad_norm": 38.27806854248047, "learning_rate": 9.599917763157896e-06, "loss": 0.7292, "step": 980 }, { "epoch": 0.40707236842105265, "grad_norm": 8.70391845703125, "learning_rate": 9.595805921052632e-06, "loss": 0.5728, "step": 990 }, { "epoch": 0.41118421052631576, "grad_norm": 26.568729400634766, "learning_rate": 9.591694078947369e-06, "loss": 0.6767, "step": 1000 }, { "epoch": 0.4152960526315789, "grad_norm": 54.55010986328125, "learning_rate": 9.587582236842106e-06, "loss": 0.7275, "step": 1010 }, { "epoch": 0.4194078947368421, "grad_norm": 55.86238098144531, "learning_rate": 9.583470394736842e-06, "loss": 0.6087, "step": 1020 }, { "epoch": 0.42351973684210525, "grad_norm": 67.50110626220703, "learning_rate": 9.57935855263158e-06, "loss": 1.4051, "step": 1030 }, { "epoch": 0.4276315789473684, "grad_norm": 30.782623291015625, "learning_rate": 9.575246710526317e-06, "loss": 0.75, "step": 1040 }, { "epoch": 0.4317434210526316, "grad_norm": 53.02436828613281, "learning_rate": 9.571134868421053e-06, "loss": 0.4739, "step": 1050 }, { "epoch": 0.43585526315789475, "grad_norm": 9.712882995605469, "learning_rate": 9.56702302631579e-06, "loss": 0.2168, "step": 1060 }, { "epoch": 0.4399671052631579, "grad_norm": 18.32390594482422, "learning_rate": 9.562911184210528e-06, "loss": 0.4673, "step": 1070 }, { "epoch": 0.4440789473684211, "grad_norm": 50.921531677246094, "learning_rate": 9.558799342105264e-06, "loss": 0.4137, "step": 1080 }, { "epoch": 0.44819078947368424, "grad_norm": 66.30414581298828, "learning_rate": 9.554687500000001e-06, "loss": 0.549, "step": 1090 }, { "epoch": 0.45230263157894735, "grad_norm": 0.11518318206071854, "learning_rate": 9.550575657894737e-06, "loss": 0.6499, "step": 1100 }, { "epoch": 0.4564144736842105, "grad_norm": 4.3409528732299805, "learning_rate": 9.546463815789474e-06, "loss": 0.9463, "step": 1110 }, { "epoch": 0.4605263157894737, "grad_norm": 30.97076416015625, "learning_rate": 9.542351973684212e-06, "loss": 0.6719, "step": 1120 }, { "epoch": 0.46463815789473684, "grad_norm": 13.25989818572998, "learning_rate": 9.538240131578949e-06, "loss": 0.4149, "step": 1130 }, { "epoch": 0.46875, "grad_norm": 1.624327540397644, "learning_rate": 9.534128289473685e-06, "loss": 0.7139, "step": 1140 }, { "epoch": 0.47286184210526316, "grad_norm": 1.2032309770584106, "learning_rate": 9.530016447368422e-06, "loss": 0.589, "step": 1150 }, { "epoch": 0.4769736842105263, "grad_norm": 125.88443756103516, "learning_rate": 9.525904605263158e-06, "loss": 0.5495, "step": 1160 }, { "epoch": 0.4810855263157895, "grad_norm": 31.080154418945312, "learning_rate": 9.521792763157896e-06, "loss": 0.7089, "step": 1170 }, { "epoch": 0.48519736842105265, "grad_norm": 39.169124603271484, "learning_rate": 9.517680921052633e-06, "loss": 0.8776, "step": 1180 }, { "epoch": 0.48930921052631576, "grad_norm": 157.91415405273438, "learning_rate": 9.513569078947369e-06, "loss": 0.713, "step": 1190 }, { "epoch": 0.4934210526315789, "grad_norm": 21.97187614440918, "learning_rate": 9.509457236842106e-06, "loss": 0.4497, "step": 1200 }, { "epoch": 0.4975328947368421, "grad_norm": 48.10848617553711, "learning_rate": 9.50575657894737e-06, "loss": 0.8274, "step": 1210 }, { "epoch": 0.5016447368421053, "grad_norm": 3.884164571762085, "learning_rate": 9.501644736842107e-06, "loss": 0.9336, "step": 1220 }, { "epoch": 0.5057565789473685, "grad_norm": 6.460564136505127, "learning_rate": 9.497532894736843e-06, "loss": 0.8036, "step": 1230 }, { "epoch": 0.5098684210526315, "grad_norm": 32.25432205200195, "learning_rate": 9.49342105263158e-06, "loss": 0.5282, "step": 1240 }, { "epoch": 0.5139802631578947, "grad_norm": 1.8110394477844238, "learning_rate": 9.489309210526316e-06, "loss": 0.1622, "step": 1250 }, { "epoch": 0.5180921052631579, "grad_norm": 66.42652893066406, "learning_rate": 9.485197368421053e-06, "loss": 1.3015, "step": 1260 }, { "epoch": 0.522203947368421, "grad_norm": 0.5288456678390503, "learning_rate": 9.481085526315789e-06, "loss": 1.0417, "step": 1270 }, { "epoch": 0.5263157894736842, "grad_norm": 43.16275405883789, "learning_rate": 9.476973684210528e-06, "loss": 0.2968, "step": 1280 }, { "epoch": 0.5304276315789473, "grad_norm": 2.046732187271118, "learning_rate": 9.472861842105264e-06, "loss": 0.9133, "step": 1290 }, { "epoch": 0.5345394736842105, "grad_norm": 0.09054580330848694, "learning_rate": 9.468750000000001e-06, "loss": 0.2698, "step": 1300 }, { "epoch": 0.5386513157894737, "grad_norm": 0.03111335076391697, "learning_rate": 9.464638157894737e-06, "loss": 0.2832, "step": 1310 }, { "epoch": 0.5427631578947368, "grad_norm": 78.9267578125, "learning_rate": 9.460526315789475e-06, "loss": 0.8709, "step": 1320 }, { "epoch": 0.546875, "grad_norm": 55.27962875366211, "learning_rate": 9.45641447368421e-06, "loss": 0.7564, "step": 1330 }, { "epoch": 0.5509868421052632, "grad_norm": 53.56753921508789, "learning_rate": 9.452302631578948e-06, "loss": 0.6463, "step": 1340 }, { "epoch": 0.5550986842105263, "grad_norm": 61.4496955871582, "learning_rate": 9.448190789473685e-06, "loss": 0.8631, "step": 1350 }, { "epoch": 0.5592105263157895, "grad_norm": 34.20602035522461, "learning_rate": 9.444078947368421e-06, "loss": 0.3161, "step": 1360 }, { "epoch": 0.5633223684210527, "grad_norm": 35.03656768798828, "learning_rate": 9.439967105263159e-06, "loss": 0.2618, "step": 1370 }, { "epoch": 0.5674342105263158, "grad_norm": 0.7599917054176331, "learning_rate": 9.435855263157896e-06, "loss": 0.9035, "step": 1380 }, { "epoch": 0.571546052631579, "grad_norm": 29.473997116088867, "learning_rate": 9.431743421052633e-06, "loss": 0.8102, "step": 1390 }, { "epoch": 0.5756578947368421, "grad_norm": 13.44682502746582, "learning_rate": 9.42763157894737e-06, "loss": 0.5657, "step": 1400 }, { "epoch": 0.5797697368421053, "grad_norm": 40.96477127075195, "learning_rate": 9.423519736842107e-06, "loss": 0.3194, "step": 1410 }, { "epoch": 0.5838815789473685, "grad_norm": 53.700321197509766, "learning_rate": 9.419407894736842e-06, "loss": 0.6133, "step": 1420 }, { "epoch": 0.5879934210526315, "grad_norm": 57.40962219238281, "learning_rate": 9.41529605263158e-06, "loss": 1.0728, "step": 1430 }, { "epoch": 0.5921052631578947, "grad_norm": 81.72367095947266, "learning_rate": 9.411184210526316e-06, "loss": 0.9684, "step": 1440 }, { "epoch": 0.5962171052631579, "grad_norm": 7.812744617462158, "learning_rate": 9.407072368421053e-06, "loss": 0.496, "step": 1450 }, { "epoch": 0.600328947368421, "grad_norm": 12.160445213317871, "learning_rate": 9.402960526315789e-06, "loss": 0.6885, "step": 1460 }, { "epoch": 0.6044407894736842, "grad_norm": 51.21662902832031, "learning_rate": 9.398848684210528e-06, "loss": 0.6898, "step": 1470 }, { "epoch": 0.6085526315789473, "grad_norm": 5.085622310638428, "learning_rate": 9.394736842105264e-06, "loss": 0.3369, "step": 1480 }, { "epoch": 0.6126644736842105, "grad_norm": 0.29845261573791504, "learning_rate": 9.390625000000001e-06, "loss": 1.106, "step": 1490 }, { "epoch": 0.6167763157894737, "grad_norm": 13.685811042785645, "learning_rate": 9.386513157894737e-06, "loss": 0.5248, "step": 1500 }, { "epoch": 0.6208881578947368, "grad_norm": 29.665889739990234, "learning_rate": 9.382401315789474e-06, "loss": 0.568, "step": 1510 }, { "epoch": 0.625, "grad_norm": 18.520368576049805, "learning_rate": 9.378289473684212e-06, "loss": 0.6348, "step": 1520 }, { "epoch": 0.6291118421052632, "grad_norm": 3.3405580520629883, "learning_rate": 9.374177631578948e-06, "loss": 0.595, "step": 1530 }, { "epoch": 0.6332236842105263, "grad_norm": 29.315523147583008, "learning_rate": 9.370065789473685e-06, "loss": 0.3298, "step": 1540 }, { "epoch": 0.6373355263157895, "grad_norm": 46.490848541259766, "learning_rate": 9.365953947368421e-06, "loss": 0.8335, "step": 1550 }, { "epoch": 0.6414473684210527, "grad_norm": 12.853484153747559, "learning_rate": 9.361842105263158e-06, "loss": 0.5452, "step": 1560 }, { "epoch": 0.6455592105263158, "grad_norm": 11.238713264465332, "learning_rate": 9.357730263157896e-06, "loss": 0.8813, "step": 1570 }, { "epoch": 0.649671052631579, "grad_norm": 1.7914977073669434, "learning_rate": 9.353618421052633e-06, "loss": 0.5766, "step": 1580 }, { "epoch": 0.6537828947368421, "grad_norm": 9.993241310119629, "learning_rate": 9.349506578947369e-06, "loss": 1.0686, "step": 1590 }, { "epoch": 0.6578947368421053, "grad_norm": 5.760276794433594, "learning_rate": 9.345394736842106e-06, "loss": 0.9441, "step": 1600 }, { "epoch": 0.6620065789473685, "grad_norm": 58.067134857177734, "learning_rate": 9.341282894736842e-06, "loss": 0.5564, "step": 1610 }, { "epoch": 0.6661184210526315, "grad_norm": 44.08545684814453, "learning_rate": 9.33717105263158e-06, "loss": 0.4461, "step": 1620 }, { "epoch": 0.6702302631578947, "grad_norm": 28.851512908935547, "learning_rate": 9.333059210526315e-06, "loss": 1.1042, "step": 1630 }, { "epoch": 0.6743421052631579, "grad_norm": 0.11420683562755585, "learning_rate": 9.328947368421053e-06, "loss": 0.7105, "step": 1640 }, { "epoch": 0.678453947368421, "grad_norm": 44.5568962097168, "learning_rate": 9.32483552631579e-06, "loss": 0.5754, "step": 1650 }, { "epoch": 0.6825657894736842, "grad_norm": 16.143844604492188, "learning_rate": 9.320723684210528e-06, "loss": 0.5564, "step": 1660 }, { "epoch": 0.6866776315789473, "grad_norm": 87.19486236572266, "learning_rate": 9.316611842105264e-06, "loss": 0.7184, "step": 1670 }, { "epoch": 0.6907894736842105, "grad_norm": 26.366031646728516, "learning_rate": 9.312500000000001e-06, "loss": 0.4315, "step": 1680 }, { "epoch": 0.6949013157894737, "grad_norm": 30.502588272094727, "learning_rate": 9.308388157894739e-06, "loss": 1.1223, "step": 1690 }, { "epoch": 0.6990131578947368, "grad_norm": 59.447601318359375, "learning_rate": 9.304276315789474e-06, "loss": 0.5216, "step": 1700 }, { "epoch": 0.703125, "grad_norm": 25.073314666748047, "learning_rate": 9.300164473684212e-06, "loss": 0.7732, "step": 1710 }, { "epoch": 0.7072368421052632, "grad_norm": 3.718965530395508, "learning_rate": 9.296052631578947e-06, "loss": 0.7764, "step": 1720 }, { "epoch": 0.7113486842105263, "grad_norm": 48.748783111572266, "learning_rate": 9.291940789473685e-06, "loss": 0.4773, "step": 1730 }, { "epoch": 0.7154605263157895, "grad_norm": 17.26929473876953, "learning_rate": 9.28782894736842e-06, "loss": 0.3774, "step": 1740 }, { "epoch": 0.7195723684210527, "grad_norm": 0.05121591314673424, "learning_rate": 9.28371710526316e-06, "loss": 0.7225, "step": 1750 }, { "epoch": 0.7236842105263158, "grad_norm": 1.7986981868743896, "learning_rate": 9.279605263157896e-06, "loss": 0.6779, "step": 1760 }, { "epoch": 0.727796052631579, "grad_norm": 77.67304229736328, "learning_rate": 9.275493421052633e-06, "loss": 0.3377, "step": 1770 }, { "epoch": 0.7319078947368421, "grad_norm": 12.0983304977417, "learning_rate": 9.271381578947369e-06, "loss": 0.7999, "step": 1780 }, { "epoch": 0.7360197368421053, "grad_norm": 0.6760322451591492, "learning_rate": 9.267269736842106e-06, "loss": 0.6168, "step": 1790 }, { "epoch": 0.7401315789473685, "grad_norm": 11.3161039352417, "learning_rate": 9.263157894736842e-06, "loss": 0.5886, "step": 1800 }, { "epoch": 0.7442434210526315, "grad_norm": 5.615332126617432, "learning_rate": 9.25904605263158e-06, "loss": 0.1935, "step": 1810 }, { "epoch": 0.7483552631578947, "grad_norm": 40.399017333984375, "learning_rate": 9.254934210526317e-06, "loss": 0.4364, "step": 1820 }, { "epoch": 0.7524671052631579, "grad_norm": 20.905866622924805, "learning_rate": 9.250822368421053e-06, "loss": 0.9716, "step": 1830 }, { "epoch": 0.756578947368421, "grad_norm": 50.7322883605957, "learning_rate": 9.24671052631579e-06, "loss": 0.4309, "step": 1840 }, { "epoch": 0.7606907894736842, "grad_norm": 36.55461883544922, "learning_rate": 9.242598684210528e-06, "loss": 1.0015, "step": 1850 }, { "epoch": 0.7648026315789473, "grad_norm": 55.39717483520508, "learning_rate": 9.238486842105265e-06, "loss": 1.1342, "step": 1860 }, { "epoch": 0.7689144736842105, "grad_norm": 47.303123474121094, "learning_rate": 9.234375e-06, "loss": 0.4452, "step": 1870 }, { "epoch": 0.7730263157894737, "grad_norm": 0.07194405794143677, "learning_rate": 9.230263157894738e-06, "loss": 0.6433, "step": 1880 }, { "epoch": 0.7771381578947368, "grad_norm": 80.85247039794922, "learning_rate": 9.226151315789474e-06, "loss": 0.5254, "step": 1890 }, { "epoch": 0.78125, "grad_norm": 27.962688446044922, "learning_rate": 9.222039473684212e-06, "loss": 0.92, "step": 1900 }, { "epoch": 0.7853618421052632, "grad_norm": 43.64583969116211, "learning_rate": 9.217927631578947e-06, "loss": 0.6327, "step": 1910 }, { "epoch": 0.7894736842105263, "grad_norm": 25.193130493164062, "learning_rate": 9.213815789473685e-06, "loss": 0.4634, "step": 1920 }, { "epoch": 0.7935855263157895, "grad_norm": 0.6646166443824768, "learning_rate": 9.20970394736842e-06, "loss": 0.2824, "step": 1930 }, { "epoch": 0.7976973684210527, "grad_norm": 66.69795989990234, "learning_rate": 9.20559210526316e-06, "loss": 0.6365, "step": 1940 }, { "epoch": 0.8018092105263158, "grad_norm": 26.328622817993164, "learning_rate": 9.201480263157895e-06, "loss": 0.6187, "step": 1950 }, { "epoch": 0.805921052631579, "grad_norm": 43.69908905029297, "learning_rate": 9.197368421052633e-06, "loss": 0.7507, "step": 1960 }, { "epoch": 0.8100328947368421, "grad_norm": 55.16326904296875, "learning_rate": 9.193256578947369e-06, "loss": 0.5324, "step": 1970 }, { "epoch": 0.8141447368421053, "grad_norm": 0.5004386901855469, "learning_rate": 9.189144736842106e-06, "loss": 0.3479, "step": 1980 }, { "epoch": 0.8182565789473685, "grad_norm": 0.6016833782196045, "learning_rate": 9.185032894736844e-06, "loss": 0.8248, "step": 1990 }, { "epoch": 0.8223684210526315, "grad_norm": 49.71751403808594, "learning_rate": 9.18092105263158e-06, "loss": 0.7017, "step": 2000 }, { "epoch": 0.8264802631578947, "grad_norm": 60.05244064331055, "learning_rate": 9.176809210526317e-06, "loss": 0.642, "step": 2010 }, { "epoch": 0.8305921052631579, "grad_norm": 70.1126708984375, "learning_rate": 9.172697368421053e-06, "loss": 0.9515, "step": 2020 }, { "epoch": 0.834703947368421, "grad_norm": 70.0270004272461, "learning_rate": 9.16858552631579e-06, "loss": 0.5299, "step": 2030 }, { "epoch": 0.8388157894736842, "grad_norm": 0.9353859424591064, "learning_rate": 9.164473684210527e-06, "loss": 0.8934, "step": 2040 }, { "epoch": 0.8429276315789473, "grad_norm": 0.3574899435043335, "learning_rate": 9.160361842105265e-06, "loss": 0.5327, "step": 2050 }, { "epoch": 0.8470394736842105, "grad_norm": 77.09919738769531, "learning_rate": 9.15625e-06, "loss": 1.1277, "step": 2060 }, { "epoch": 0.8511513157894737, "grad_norm": 94.8553695678711, "learning_rate": 9.152138157894738e-06, "loss": 1.0451, "step": 2070 }, { "epoch": 0.8552631578947368, "grad_norm": 6.773098945617676, "learning_rate": 9.148026315789474e-06, "loss": 0.5617, "step": 2080 }, { "epoch": 0.859375, "grad_norm": 3.3853366374969482, "learning_rate": 9.143914473684211e-06, "loss": 0.5929, "step": 2090 }, { "epoch": 0.8634868421052632, "grad_norm": 24.653453826904297, "learning_rate": 9.139802631578947e-06, "loss": 0.6193, "step": 2100 }, { "epoch": 0.8675986842105263, "grad_norm": 51.23694610595703, "learning_rate": 9.135690789473685e-06, "loss": 1.0407, "step": 2110 }, { "epoch": 0.8717105263157895, "grad_norm": 39.51556396484375, "learning_rate": 9.131578947368422e-06, "loss": 0.7252, "step": 2120 }, { "epoch": 0.8758223684210527, "grad_norm": 0.20115692913532257, "learning_rate": 9.12746710526316e-06, "loss": 0.6013, "step": 2130 }, { "epoch": 0.8799342105263158, "grad_norm": 58.84267044067383, "learning_rate": 9.123355263157895e-06, "loss": 0.686, "step": 2140 }, { "epoch": 0.884046052631579, "grad_norm": 0.0019536695908755064, "learning_rate": 9.119243421052633e-06, "loss": 0.2745, "step": 2150 }, { "epoch": 0.8881578947368421, "grad_norm": 66.56216430664062, "learning_rate": 9.11513157894737e-06, "loss": 0.7467, "step": 2160 }, { "epoch": 0.8922697368421053, "grad_norm": 4.7335100173950195, "learning_rate": 9.111019736842106e-06, "loss": 0.1942, "step": 2170 }, { "epoch": 0.8963815789473685, "grad_norm": 39.29569625854492, "learning_rate": 9.106907894736843e-06, "loss": 0.4847, "step": 2180 }, { "epoch": 0.9004934210526315, "grad_norm": 5.849125862121582, "learning_rate": 9.102796052631579e-06, "loss": 0.4814, "step": 2190 }, { "epoch": 0.9046052631578947, "grad_norm": 15.409236907958984, "learning_rate": 9.098684210526317e-06, "loss": 0.2912, "step": 2200 }, { "epoch": 0.9087171052631579, "grad_norm": 0.6259574890136719, "learning_rate": 9.094572368421052e-06, "loss": 0.6347, "step": 2210 }, { "epoch": 0.912828947368421, "grad_norm": 65.06391143798828, "learning_rate": 9.09046052631579e-06, "loss": 0.3179, "step": 2220 }, { "epoch": 0.9169407894736842, "grad_norm": 0.6020347476005554, "learning_rate": 9.086348684210527e-06, "loss": 0.6884, "step": 2230 }, { "epoch": 0.9210526315789473, "grad_norm": 2.7630629539489746, "learning_rate": 9.082236842105265e-06, "loss": 0.7149, "step": 2240 }, { "epoch": 0.9251644736842105, "grad_norm": 12.887619972229004, "learning_rate": 9.078125e-06, "loss": 0.59, "step": 2250 }, { "epoch": 0.9292763157894737, "grad_norm": 49.26618194580078, "learning_rate": 9.074013157894738e-06, "loss": 0.4737, "step": 2260 }, { "epoch": 0.9333881578947368, "grad_norm": 1.3609040975570679, "learning_rate": 9.069901315789474e-06, "loss": 0.4908, "step": 2270 }, { "epoch": 0.9375, "grad_norm": 56.066932678222656, "learning_rate": 9.065789473684211e-06, "loss": 0.7488, "step": 2280 }, { "epoch": 0.9416118421052632, "grad_norm": 79.9988784790039, "learning_rate": 9.061677631578949e-06, "loss": 0.7794, "step": 2290 }, { "epoch": 0.9457236842105263, "grad_norm": 0.05063479766249657, "learning_rate": 9.057565789473684e-06, "loss": 0.2889, "step": 2300 }, { "epoch": 0.9498355263157895, "grad_norm": 55.820369720458984, "learning_rate": 9.053453947368422e-06, "loss": 0.7358, "step": 2310 }, { "epoch": 0.9539473684210527, "grad_norm": 84.37875366210938, "learning_rate": 9.04934210526316e-06, "loss": 0.8118, "step": 2320 }, { "epoch": 0.9580592105263158, "grad_norm": 69.74197387695312, "learning_rate": 9.045230263157897e-06, "loss": 1.1243, "step": 2330 }, { "epoch": 0.962171052631579, "grad_norm": 0.7944409251213074, "learning_rate": 9.041118421052632e-06, "loss": 0.1485, "step": 2340 }, { "epoch": 0.9662828947368421, "grad_norm": 9.75572681427002, "learning_rate": 9.03700657894737e-06, "loss": 0.6436, "step": 2350 }, { "epoch": 0.9703947368421053, "grad_norm": 0.3801949918270111, "learning_rate": 9.032894736842106e-06, "loss": 0.6525, "step": 2360 }, { "epoch": 0.9745065789473685, "grad_norm": 0.04239730164408684, "learning_rate": 9.028782894736843e-06, "loss": 0.4073, "step": 2370 }, { "epoch": 0.9786184210526315, "grad_norm": 0.5113763809204102, "learning_rate": 9.024671052631579e-06, "loss": 0.4306, "step": 2380 }, { "epoch": 0.9827302631578947, "grad_norm": 67.6680679321289, "learning_rate": 9.020559210526316e-06, "loss": 0.8971, "step": 2390 }, { "epoch": 0.9868421052631579, "grad_norm": 64.32469177246094, "learning_rate": 9.016447368421054e-06, "loss": 0.4565, "step": 2400 }, { "epoch": 0.990953947368421, "grad_norm": 2.2221884727478027, "learning_rate": 9.01233552631579e-06, "loss": 0.5263, "step": 2410 }, { "epoch": 0.9950657894736842, "grad_norm": 7.352302551269531, "learning_rate": 9.008223684210527e-06, "loss": 0.6523, "step": 2420 }, { "epoch": 0.9991776315789473, "grad_norm": 66.60418701171875, "learning_rate": 9.004111842105264e-06, "loss": 0.401, "step": 2430 }, { "epoch": 1.0032894736842106, "grad_norm": 10.393697738647461, "learning_rate": 9e-06, "loss": 0.3328, "step": 2440 }, { "epoch": 1.0074013157894737, "grad_norm": 33.00063705444336, "learning_rate": 8.995888157894738e-06, "loss": 0.1872, "step": 2450 }, { "epoch": 1.011513157894737, "grad_norm": 53.126712799072266, "learning_rate": 8.991776315789475e-06, "loss": 1.1614, "step": 2460 }, { "epoch": 1.015625, "grad_norm": 0.4837985634803772, "learning_rate": 8.987664473684211e-06, "loss": 0.2477, "step": 2470 }, { "epoch": 1.019736842105263, "grad_norm": 5.717953681945801, "learning_rate": 8.983552631578948e-06, "loss": 0.1087, "step": 2480 }, { "epoch": 1.0238486842105263, "grad_norm": 13.705921173095703, "learning_rate": 8.979440789473684e-06, "loss": 0.1692, "step": 2490 }, { "epoch": 1.0279605263157894, "grad_norm": 61.79713439941406, "learning_rate": 8.975328947368422e-06, "loss": 0.6781, "step": 2500 }, { "epoch": 1.0320723684210527, "grad_norm": 5.752686023712158, "learning_rate": 8.971217105263159e-06, "loss": 0.5315, "step": 2510 }, { "epoch": 1.0361842105263157, "grad_norm": 23.530588150024414, "learning_rate": 8.967105263157896e-06, "loss": 0.585, "step": 2520 }, { "epoch": 1.040296052631579, "grad_norm": 0.11146017163991928, "learning_rate": 8.962993421052632e-06, "loss": 0.4786, "step": 2530 }, { "epoch": 1.044407894736842, "grad_norm": 4.48652458190918, "learning_rate": 8.95888157894737e-06, "loss": 0.659, "step": 2540 }, { "epoch": 1.0485197368421053, "grad_norm": 3.888078451156616, "learning_rate": 8.954769736842105e-06, "loss": 0.1793, "step": 2550 }, { "epoch": 1.0526315789473684, "grad_norm": 0.5999749302864075, "learning_rate": 8.950657894736843e-06, "loss": 0.3992, "step": 2560 }, { "epoch": 1.0567434210526316, "grad_norm": 20.677486419677734, "learning_rate": 8.946546052631579e-06, "loss": 0.4838, "step": 2570 }, { "epoch": 1.0608552631578947, "grad_norm": 0.10241834819316864, "learning_rate": 8.942434210526316e-06, "loss": 0.6778, "step": 2580 }, { "epoch": 1.064967105263158, "grad_norm": 87.81678009033203, "learning_rate": 8.938322368421054e-06, "loss": 0.4954, "step": 2590 }, { "epoch": 1.069078947368421, "grad_norm": 109.55130767822266, "learning_rate": 8.93421052631579e-06, "loss": 0.489, "step": 2600 }, { "epoch": 1.0731907894736843, "grad_norm": 52.265106201171875, "learning_rate": 8.930098684210527e-06, "loss": 0.5496, "step": 2610 }, { "epoch": 1.0773026315789473, "grad_norm": 4.881550312042236, "learning_rate": 8.925986842105264e-06, "loss": 0.3602, "step": 2620 }, { "epoch": 1.0814144736842106, "grad_norm": 59.2353401184082, "learning_rate": 8.921875000000002e-06, "loss": 0.7403, "step": 2630 }, { "epoch": 1.0855263157894737, "grad_norm": 64.6301040649414, "learning_rate": 8.917763157894737e-06, "loss": 0.6835, "step": 2640 }, { "epoch": 1.0896381578947367, "grad_norm": 45.62979507446289, "learning_rate": 8.913651315789475e-06, "loss": 0.4795, "step": 2650 }, { "epoch": 1.09375, "grad_norm": 18.216320037841797, "learning_rate": 8.90953947368421e-06, "loss": 0.557, "step": 2660 }, { "epoch": 1.0978618421052633, "grad_norm": 0.05309246852993965, "learning_rate": 8.905427631578948e-06, "loss": 0.3478, "step": 2670 }, { "epoch": 1.1019736842105263, "grad_norm": 0.35603123903274536, "learning_rate": 8.901315789473684e-06, "loss": 0.1215, "step": 2680 }, { "epoch": 1.1060855263157894, "grad_norm": 3.711970567703247, "learning_rate": 8.897203947368421e-06, "loss": 0.2544, "step": 2690 }, { "epoch": 1.1101973684210527, "grad_norm": 5.010875225067139, "learning_rate": 8.893092105263159e-06, "loss": 0.2661, "step": 2700 }, { "epoch": 1.1143092105263157, "grad_norm": 51.760868072509766, "learning_rate": 8.888980263157896e-06, "loss": 0.3528, "step": 2710 }, { "epoch": 1.118421052631579, "grad_norm": 0.11445479840040207, "learning_rate": 8.884868421052632e-06, "loss": 0.542, "step": 2720 }, { "epoch": 1.122532894736842, "grad_norm": 4.898531913757324, "learning_rate": 8.88075657894737e-06, "loss": 0.7877, "step": 2730 }, { "epoch": 1.1266447368421053, "grad_norm": 12.105345726013184, "learning_rate": 8.876644736842105e-06, "loss": 0.7329, "step": 2740 }, { "epoch": 1.1307565789473684, "grad_norm": 6.530974864959717, "learning_rate": 8.872532894736843e-06, "loss": 0.2338, "step": 2750 }, { "epoch": 1.1348684210526316, "grad_norm": 5.893588066101074, "learning_rate": 8.86842105263158e-06, "loss": 0.7787, "step": 2760 }, { "epoch": 1.1389802631578947, "grad_norm": 52.82911682128906, "learning_rate": 8.864309210526316e-06, "loss": 0.2331, "step": 2770 }, { "epoch": 1.143092105263158, "grad_norm": 15.883142471313477, "learning_rate": 8.860197368421053e-06, "loss": 0.3743, "step": 2780 }, { "epoch": 1.147203947368421, "grad_norm": 99.91140747070312, "learning_rate": 8.856085526315789e-06, "loss": 0.1293, "step": 2790 }, { "epoch": 1.1513157894736843, "grad_norm": 0.10743849724531174, "learning_rate": 8.851973684210528e-06, "loss": 0.1448, "step": 2800 }, { "epoch": 1.1554276315789473, "grad_norm": 0.12402098625898361, "learning_rate": 8.847861842105264e-06, "loss": 0.2003, "step": 2810 }, { "epoch": 1.1595394736842106, "grad_norm": 0.9490414261817932, "learning_rate": 8.843750000000002e-06, "loss": 0.1746, "step": 2820 }, { "epoch": 1.1636513157894737, "grad_norm": 8.182841300964355, "learning_rate": 8.839638157894737e-06, "loss": 0.2, "step": 2830 }, { "epoch": 1.1677631578947367, "grad_norm": 29.3283748626709, "learning_rate": 8.835526315789475e-06, "loss": 1.0814, "step": 2840 }, { "epoch": 1.171875, "grad_norm": 0.44098392128944397, "learning_rate": 8.83141447368421e-06, "loss": 0.1534, "step": 2850 }, { "epoch": 1.1759868421052633, "grad_norm": 26.898929595947266, "learning_rate": 8.827302631578948e-06, "loss": 0.3173, "step": 2860 }, { "epoch": 1.1800986842105263, "grad_norm": 49.62389373779297, "learning_rate": 8.823190789473685e-06, "loss": 0.6717, "step": 2870 }, { "epoch": 1.1842105263157894, "grad_norm": 54.11696243286133, "learning_rate": 8.819078947368421e-06, "loss": 0.4769, "step": 2880 }, { "epoch": 1.1883223684210527, "grad_norm": 0.8223657011985779, "learning_rate": 8.814967105263159e-06, "loss": 0.1133, "step": 2890 }, { "epoch": 1.1924342105263157, "grad_norm": 0.30546388030052185, "learning_rate": 8.810855263157896e-06, "loss": 0.1435, "step": 2900 }, { "epoch": 1.196546052631579, "grad_norm": 0.005724355112761259, "learning_rate": 8.806743421052632e-06, "loss": 0.0993, "step": 2910 }, { "epoch": 1.200657894736842, "grad_norm": 2.1313202381134033, "learning_rate": 8.80263157894737e-06, "loss": 0.301, "step": 2920 }, { "epoch": 1.2047697368421053, "grad_norm": 2.2508060932159424, "learning_rate": 8.798519736842107e-06, "loss": 0.3936, "step": 2930 }, { "epoch": 1.2088815789473684, "grad_norm": 17.970064163208008, "learning_rate": 8.794407894736843e-06, "loss": 0.2133, "step": 2940 }, { "epoch": 1.2129934210526316, "grad_norm": 3.051180839538574, "learning_rate": 8.79029605263158e-06, "loss": 0.3686, "step": 2950 }, { "epoch": 1.2171052631578947, "grad_norm": 63.62922286987305, "learning_rate": 8.786184210526316e-06, "loss": 0.6775, "step": 2960 }, { "epoch": 1.221217105263158, "grad_norm": 91.9775390625, "learning_rate": 8.782072368421053e-06, "loss": 0.1896, "step": 2970 }, { "epoch": 1.225328947368421, "grad_norm": 13.494383811950684, "learning_rate": 8.777960526315789e-06, "loss": 0.446, "step": 2980 }, { "epoch": 1.2294407894736843, "grad_norm": 12.346921920776367, "learning_rate": 8.773848684210528e-06, "loss": 0.2766, "step": 2990 }, { "epoch": 1.2335526315789473, "grad_norm": 69.7927017211914, "learning_rate": 8.769736842105264e-06, "loss": 0.7206, "step": 3000 }, { "epoch": 1.2376644736842106, "grad_norm": 0.10051664710044861, "learning_rate": 8.765625000000001e-06, "loss": 0.8094, "step": 3010 }, { "epoch": 1.2417763157894737, "grad_norm": 73.73413848876953, "learning_rate": 8.761513157894737e-06, "loss": 0.3381, "step": 3020 }, { "epoch": 1.2458881578947367, "grad_norm": 5.243391513824463, "learning_rate": 8.757401315789475e-06, "loss": 0.424, "step": 3030 }, { "epoch": 1.25, "grad_norm": 0.6941500902175903, "learning_rate": 8.75328947368421e-06, "loss": 0.3621, "step": 3040 }, { "epoch": 1.2541118421052633, "grad_norm": 1.7824820280075073, "learning_rate": 8.749177631578948e-06, "loss": 0.5639, "step": 3050 }, { "epoch": 1.2582236842105263, "grad_norm": 74.03777313232422, "learning_rate": 8.745065789473685e-06, "loss": 0.3756, "step": 3060 }, { "epoch": 1.2623355263157894, "grad_norm": 63.435646057128906, "learning_rate": 8.740953947368421e-06, "loss": 0.4663, "step": 3070 }, { "epoch": 1.2664473684210527, "grad_norm": 20.51471519470215, "learning_rate": 8.736842105263158e-06, "loss": 0.5241, "step": 3080 }, { "epoch": 1.2705592105263157, "grad_norm": 31.2130069732666, "learning_rate": 8.732730263157896e-06, "loss": 0.049, "step": 3090 }, { "epoch": 1.274671052631579, "grad_norm": 0.4100927412509918, "learning_rate": 8.728618421052633e-06, "loss": 0.6768, "step": 3100 }, { "epoch": 1.278782894736842, "grad_norm": 8.450210571289062, "learning_rate": 8.724506578947369e-06, "loss": 0.2578, "step": 3110 }, { "epoch": 1.2828947368421053, "grad_norm": 47.065731048583984, "learning_rate": 8.720394736842107e-06, "loss": 0.2957, "step": 3120 }, { "epoch": 1.2870065789473684, "grad_norm": 20.626476287841797, "learning_rate": 8.716282894736842e-06, "loss": 0.0919, "step": 3130 }, { "epoch": 1.2911184210526316, "grad_norm": 2.748300313949585, "learning_rate": 8.71217105263158e-06, "loss": 0.3204, "step": 3140 }, { "epoch": 1.2952302631578947, "grad_norm": 18.729766845703125, "learning_rate": 8.708059210526316e-06, "loss": 0.1864, "step": 3150 }, { "epoch": 1.299342105263158, "grad_norm": 2.0139780044555664, "learning_rate": 8.703947368421053e-06, "loss": 0.1559, "step": 3160 }, { "epoch": 1.303453947368421, "grad_norm": 2.845900774002075, "learning_rate": 8.69983552631579e-06, "loss": 0.6238, "step": 3170 }, { "epoch": 1.3075657894736843, "grad_norm": 46.37712860107422, "learning_rate": 8.695723684210528e-06, "loss": 0.2323, "step": 3180 }, { "epoch": 1.3116776315789473, "grad_norm": 271.55926513671875, "learning_rate": 8.691611842105264e-06, "loss": 0.2997, "step": 3190 }, { "epoch": 1.3157894736842106, "grad_norm": 0.07383125275373459, "learning_rate": 8.687500000000001e-06, "loss": 0.3775, "step": 3200 }, { "epoch": 1.3199013157894737, "grad_norm": 0.09837435185909271, "learning_rate": 8.683388157894737e-06, "loss": 0.7621, "step": 3210 }, { "epoch": 1.3240131578947367, "grad_norm": 9.785972595214844, "learning_rate": 8.679276315789474e-06, "loss": 0.1313, "step": 3220 }, { "epoch": 1.328125, "grad_norm": 12.865886688232422, "learning_rate": 8.675164473684212e-06, "loss": 0.3697, "step": 3230 }, { "epoch": 1.3322368421052633, "grad_norm": 65.38900756835938, "learning_rate": 8.671052631578948e-06, "loss": 0.1807, "step": 3240 }, { "epoch": 1.3363486842105263, "grad_norm": 21.760971069335938, "learning_rate": 8.666940789473685e-06, "loss": 0.427, "step": 3250 }, { "epoch": 1.3404605263157894, "grad_norm": 0.22508497536182404, "learning_rate": 8.66282894736842e-06, "loss": 0.2474, "step": 3260 }, { "epoch": 1.3445723684210527, "grad_norm": 0.027158482000231743, "learning_rate": 8.65871710526316e-06, "loss": 0.0662, "step": 3270 }, { "epoch": 1.3486842105263157, "grad_norm": 0.5353153944015503, "learning_rate": 8.654605263157896e-06, "loss": 0.6432, "step": 3280 }, { "epoch": 1.352796052631579, "grad_norm": 34.469608306884766, "learning_rate": 8.650493421052633e-06, "loss": 0.2062, "step": 3290 }, { "epoch": 1.356907894736842, "grad_norm": 1.8599954843521118, "learning_rate": 8.646381578947369e-06, "loss": 0.4208, "step": 3300 }, { "epoch": 1.3610197368421053, "grad_norm": 38.51445770263672, "learning_rate": 8.642269736842106e-06, "loss": 0.2114, "step": 3310 }, { "epoch": 1.3651315789473684, "grad_norm": 69.46410369873047, "learning_rate": 8.638157894736842e-06, "loss": 0.6396, "step": 3320 }, { "epoch": 1.3692434210526316, "grad_norm": 5.565284729003906, "learning_rate": 8.63404605263158e-06, "loss": 0.6757, "step": 3330 }, { "epoch": 1.3733552631578947, "grad_norm": 5.994679927825928, "learning_rate": 8.629934210526317e-06, "loss": 0.2594, "step": 3340 }, { "epoch": 1.377467105263158, "grad_norm": 30.572546005249023, "learning_rate": 8.625822368421053e-06, "loss": 0.2005, "step": 3350 }, { "epoch": 1.381578947368421, "grad_norm": 78.56243896484375, "learning_rate": 8.62171052631579e-06, "loss": 0.6431, "step": 3360 }, { "epoch": 1.3856907894736843, "grad_norm": 54.2548828125, "learning_rate": 8.617598684210528e-06, "loss": 0.3055, "step": 3370 }, { "epoch": 1.3898026315789473, "grad_norm": 0.4007514715194702, "learning_rate": 8.613486842105263e-06, "loss": 0.3877, "step": 3380 }, { "epoch": 1.3939144736842106, "grad_norm": 20.567773818969727, "learning_rate": 8.609375000000001e-06, "loss": 0.483, "step": 3390 }, { "epoch": 1.3980263157894737, "grad_norm": 57.93979263305664, "learning_rate": 8.605263157894738e-06, "loss": 0.5153, "step": 3400 }, { "epoch": 1.4021381578947367, "grad_norm": 0.26199084520339966, "learning_rate": 8.601151315789474e-06, "loss": 0.3544, "step": 3410 }, { "epoch": 1.40625, "grad_norm": 13.385703086853027, "learning_rate": 8.597039473684212e-06, "loss": 0.5427, "step": 3420 }, { "epoch": 1.4103618421052633, "grad_norm": 9.025935173034668, "learning_rate": 8.592927631578947e-06, "loss": 0.3185, "step": 3430 }, { "epoch": 1.4144736842105263, "grad_norm": 53.11691665649414, "learning_rate": 8.588815789473685e-06, "loss": 0.3702, "step": 3440 }, { "epoch": 1.4185855263157894, "grad_norm": 0.0294374767690897, "learning_rate": 8.58470394736842e-06, "loss": 0.3489, "step": 3450 }, { "epoch": 1.4226973684210527, "grad_norm": 73.63797760009766, "learning_rate": 8.58059210526316e-06, "loss": 0.1992, "step": 3460 }, { "epoch": 1.4268092105263157, "grad_norm": 10.585562705993652, "learning_rate": 8.576480263157895e-06, "loss": 0.2284, "step": 3470 }, { "epoch": 1.430921052631579, "grad_norm": 2.052852153778076, "learning_rate": 8.572368421052633e-06, "loss": 0.1912, "step": 3480 }, { "epoch": 1.435032894736842, "grad_norm": 0.003717209445312619, "learning_rate": 8.568256578947369e-06, "loss": 0.5507, "step": 3490 }, { "epoch": 1.4391447368421053, "grad_norm": 0.7865049242973328, "learning_rate": 8.564144736842106e-06, "loss": 0.0211, "step": 3500 }, { "epoch": 1.4432565789473684, "grad_norm": 15.986753463745117, "learning_rate": 8.560032894736842e-06, "loss": 0.2161, "step": 3510 }, { "epoch": 1.4473684210526316, "grad_norm": 6.693492412567139, "learning_rate": 8.55592105263158e-06, "loss": 0.3488, "step": 3520 }, { "epoch": 1.4514802631578947, "grad_norm": 39.255794525146484, "learning_rate": 8.551809210526317e-06, "loss": 0.4337, "step": 3530 }, { "epoch": 1.455592105263158, "grad_norm": 114.03556823730469, "learning_rate": 8.547697368421053e-06, "loss": 0.4609, "step": 3540 }, { "epoch": 1.459703947368421, "grad_norm": 0.11433131992816925, "learning_rate": 8.54358552631579e-06, "loss": 0.2146, "step": 3550 }, { "epoch": 1.4638157894736843, "grad_norm": 71.40312957763672, "learning_rate": 8.539473684210527e-06, "loss": 0.3865, "step": 3560 }, { "epoch": 1.4679276315789473, "grad_norm": 0.18895581364631653, "learning_rate": 8.535361842105265e-06, "loss": 0.5562, "step": 3570 }, { "epoch": 1.4720394736842106, "grad_norm": 16.85786247253418, "learning_rate": 8.53125e-06, "loss": 0.2986, "step": 3580 }, { "epoch": 1.4761513157894737, "grad_norm": 0.0461287759244442, "learning_rate": 8.527138157894738e-06, "loss": 0.5559, "step": 3590 }, { "epoch": 1.4802631578947367, "grad_norm": 63.416690826416016, "learning_rate": 8.523026315789474e-06, "loss": 0.4644, "step": 3600 }, { "epoch": 1.484375, "grad_norm": 96.18208312988281, "learning_rate": 8.518914473684211e-06, "loss": 0.4958, "step": 3610 }, { "epoch": 1.4884868421052633, "grad_norm": 2.9224765300750732, "learning_rate": 8.514802631578947e-06, "loss": 0.2215, "step": 3620 }, { "epoch": 1.4925986842105263, "grad_norm": 90.86573791503906, "learning_rate": 8.510690789473685e-06, "loss": 0.4001, "step": 3630 }, { "epoch": 1.4967105263157894, "grad_norm": 0.019238781183958054, "learning_rate": 8.506578947368422e-06, "loss": 0.2813, "step": 3640 }, { "epoch": 1.5008223684210527, "grad_norm": 64.77748107910156, "learning_rate": 8.50246710526316e-06, "loss": 0.4256, "step": 3650 }, { "epoch": 1.504934210526316, "grad_norm": 66.73531341552734, "learning_rate": 8.498355263157895e-06, "loss": 0.5352, "step": 3660 }, { "epoch": 1.509046052631579, "grad_norm": 30.41202163696289, "learning_rate": 8.494243421052633e-06, "loss": 0.4228, "step": 3670 }, { "epoch": 1.513157894736842, "grad_norm": 48.70793914794922, "learning_rate": 8.490131578947368e-06, "loss": 0.4549, "step": 3680 }, { "epoch": 1.5172697368421053, "grad_norm": 52.415306091308594, "learning_rate": 8.486019736842106e-06, "loss": 0.5359, "step": 3690 }, { "epoch": 1.5213815789473686, "grad_norm": 0.15045365691184998, "learning_rate": 8.481907894736843e-06, "loss": 0.2304, "step": 3700 }, { "epoch": 1.5254934210526314, "grad_norm": 0.844916582107544, "learning_rate": 8.477796052631579e-06, "loss": 0.7176, "step": 3710 }, { "epoch": 1.5296052631578947, "grad_norm": 1.8648791313171387, "learning_rate": 8.473684210526317e-06, "loss": 0.2773, "step": 3720 }, { "epoch": 1.533717105263158, "grad_norm": 0.052761100232601166, "learning_rate": 8.469572368421052e-06, "loss": 0.0183, "step": 3730 }, { "epoch": 1.537828947368421, "grad_norm": 54.12556838989258, "learning_rate": 8.46546052631579e-06, "loss": 0.5591, "step": 3740 }, { "epoch": 1.541940789473684, "grad_norm": 0.6858662366867065, "learning_rate": 8.461348684210527e-06, "loss": 0.1663, "step": 3750 }, { "epoch": 1.5460526315789473, "grad_norm": 0.3100195527076721, "learning_rate": 8.457236842105265e-06, "loss": 0.5894, "step": 3760 }, { "epoch": 1.5501644736842106, "grad_norm": 29.110191345214844, "learning_rate": 8.453125e-06, "loss": 0.6844, "step": 3770 }, { "epoch": 1.5542763157894737, "grad_norm": 84.61830139160156, "learning_rate": 8.449013157894738e-06, "loss": 0.8012, "step": 3780 }, { "epoch": 1.5583881578947367, "grad_norm": 69.99295043945312, "learning_rate": 8.444901315789474e-06, "loss": 0.4886, "step": 3790 }, { "epoch": 1.5625, "grad_norm": 28.908971786499023, "learning_rate": 8.440789473684211e-06, "loss": 0.3999, "step": 3800 }, { "epoch": 1.5666118421052633, "grad_norm": 0.005273888353258371, "learning_rate": 8.436677631578949e-06, "loss": 0.5796, "step": 3810 }, { "epoch": 1.5707236842105263, "grad_norm": 0.5902597308158875, "learning_rate": 8.432565789473684e-06, "loss": 0.4326, "step": 3820 }, { "epoch": 1.5748355263157894, "grad_norm": 17.558439254760742, "learning_rate": 8.428453947368422e-06, "loss": 0.1512, "step": 3830 }, { "epoch": 1.5789473684210527, "grad_norm": 28.71281623840332, "learning_rate": 8.42434210526316e-06, "loss": 0.0765, "step": 3840 }, { "epoch": 1.583059210526316, "grad_norm": 0.1115703135728836, "learning_rate": 8.420230263157895e-06, "loss": 0.8147, "step": 3850 }, { "epoch": 1.587171052631579, "grad_norm": 105.5567626953125, "learning_rate": 8.416118421052633e-06, "loss": 0.7391, "step": 3860 }, { "epoch": 1.591282894736842, "grad_norm": 15.565473556518555, "learning_rate": 8.41200657894737e-06, "loss": 0.6355, "step": 3870 }, { "epoch": 1.5953947368421053, "grad_norm": 29.326662063598633, "learning_rate": 8.407894736842106e-06, "loss": 0.3711, "step": 3880 }, { "epoch": 1.5995065789473686, "grad_norm": 37.79521560668945, "learning_rate": 8.403782894736843e-06, "loss": 0.302, "step": 3890 }, { "epoch": 1.6036184210526314, "grad_norm": 9.726346015930176, "learning_rate": 8.399671052631579e-06, "loss": 0.6002, "step": 3900 }, { "epoch": 1.6077302631578947, "grad_norm": 47.580997467041016, "learning_rate": 8.395559210526316e-06, "loss": 0.5547, "step": 3910 }, { "epoch": 1.611842105263158, "grad_norm": 35.97808837890625, "learning_rate": 8.391447368421052e-06, "loss": 0.489, "step": 3920 }, { "epoch": 1.615953947368421, "grad_norm": 16.187610626220703, "learning_rate": 8.38733552631579e-06, "loss": 0.194, "step": 3930 }, { "epoch": 1.620065789473684, "grad_norm": 53.040794372558594, "learning_rate": 8.383223684210527e-06, "loss": 0.2443, "step": 3940 }, { "epoch": 1.6241776315789473, "grad_norm": 55.21927261352539, "learning_rate": 8.379111842105265e-06, "loss": 0.5103, "step": 3950 }, { "epoch": 1.6282894736842106, "grad_norm": 41.11189651489258, "learning_rate": 8.375e-06, "loss": 0.4835, "step": 3960 }, { "epoch": 1.6324013157894737, "grad_norm": 3.0493428707122803, "learning_rate": 8.370888157894738e-06, "loss": 0.2295, "step": 3970 }, { "epoch": 1.6365131578947367, "grad_norm": 4.010314464569092, "learning_rate": 8.366776315789475e-06, "loss": 0.2899, "step": 3980 }, { "epoch": 1.640625, "grad_norm": 1.733955979347229, "learning_rate": 8.362664473684211e-06, "loss": 0.3304, "step": 3990 }, { "epoch": 1.6447368421052633, "grad_norm": 0.39239126443862915, "learning_rate": 8.358552631578948e-06, "loss": 0.7172, "step": 4000 }, { "epoch": 1.6488486842105263, "grad_norm": 19.84231185913086, "learning_rate": 8.354440789473684e-06, "loss": 0.6881, "step": 4010 }, { "epoch": 1.6529605263157894, "grad_norm": 3.06638503074646, "learning_rate": 8.350328947368422e-06, "loss": 0.314, "step": 4020 }, { "epoch": 1.6570723684210527, "grad_norm": 0.9069366455078125, "learning_rate": 8.346217105263159e-06, "loss": 0.3215, "step": 4030 }, { "epoch": 1.661184210526316, "grad_norm": 61.60841369628906, "learning_rate": 8.342105263157897e-06, "loss": 0.5228, "step": 4040 }, { "epoch": 1.665296052631579, "grad_norm": 0.013282466679811478, "learning_rate": 8.337993421052632e-06, "loss": 0.4712, "step": 4050 }, { "epoch": 1.669407894736842, "grad_norm": 2.0385208129882812, "learning_rate": 8.33388157894737e-06, "loss": 0.0717, "step": 4060 }, { "epoch": 1.6735197368421053, "grad_norm": 31.112775802612305, "learning_rate": 8.329769736842106e-06, "loss": 0.2635, "step": 4070 }, { "epoch": 1.6776315789473686, "grad_norm": 83.4158935546875, "learning_rate": 8.325657894736843e-06, "loss": 0.3961, "step": 4080 }, { "epoch": 1.6817434210526314, "grad_norm": 0.01563643105328083, "learning_rate": 8.321546052631579e-06, "loss": 0.142, "step": 4090 }, { "epoch": 1.6858552631578947, "grad_norm": 0.2154001146554947, "learning_rate": 8.317434210526316e-06, "loss": 0.0995, "step": 4100 }, { "epoch": 1.689967105263158, "grad_norm": 0.23853525519371033, "learning_rate": 8.313322368421054e-06, "loss": 0.2759, "step": 4110 }, { "epoch": 1.694078947368421, "grad_norm": 35.838157653808594, "learning_rate": 8.30921052631579e-06, "loss": 0.541, "step": 4120 }, { "epoch": 1.698190789473684, "grad_norm": 73.19493865966797, "learning_rate": 8.305098684210527e-06, "loss": 0.2733, "step": 4130 }, { "epoch": 1.7023026315789473, "grad_norm": 0.04348941892385483, "learning_rate": 8.300986842105264e-06, "loss": 0.4615, "step": 4140 }, { "epoch": 1.7064144736842106, "grad_norm": 0.1173626184463501, "learning_rate": 8.296875e-06, "loss": 0.4318, "step": 4150 }, { "epoch": 1.7105263157894737, "grad_norm": 4.591619968414307, "learning_rate": 8.292763157894738e-06, "loss": 0.203, "step": 4160 }, { "epoch": 1.7146381578947367, "grad_norm": 70.83255767822266, "learning_rate": 8.288651315789475e-06, "loss": 0.3747, "step": 4170 }, { "epoch": 1.71875, "grad_norm": 17.581378936767578, "learning_rate": 8.28453947368421e-06, "loss": 0.1498, "step": 4180 }, { "epoch": 1.7228618421052633, "grad_norm": 25.320514678955078, "learning_rate": 8.280427631578948e-06, "loss": 0.4238, "step": 4190 }, { "epoch": 1.7269736842105263, "grad_norm": 60.90825271606445, "learning_rate": 8.276315789473684e-06, "loss": 0.5959, "step": 4200 }, { "epoch": 1.7310855263157894, "grad_norm": 2.9256107807159424, "learning_rate": 8.272203947368421e-06, "loss": 0.5433, "step": 4210 }, { "epoch": 1.7351973684210527, "grad_norm": 0.1645122617483139, "learning_rate": 8.268092105263159e-06, "loss": 0.3483, "step": 4220 }, { "epoch": 1.739309210526316, "grad_norm": 0.030659016221761703, "learning_rate": 8.263980263157896e-06, "loss": 0.6285, "step": 4230 }, { "epoch": 1.743421052631579, "grad_norm": 67.81155395507812, "learning_rate": 8.259868421052632e-06, "loss": 1.1265, "step": 4240 }, { "epoch": 1.747532894736842, "grad_norm": 0.02041751518845558, "learning_rate": 8.25575657894737e-06, "loss": 0.3916, "step": 4250 }, { "epoch": 1.7516447368421053, "grad_norm": 13.793588638305664, "learning_rate": 8.251644736842105e-06, "loss": 0.2395, "step": 4260 }, { "epoch": 1.7557565789473686, "grad_norm": 11.824005126953125, "learning_rate": 8.247532894736843e-06, "loss": 0.3163, "step": 4270 }, { "epoch": 1.7598684210526314, "grad_norm": 6.396011829376221, "learning_rate": 8.24342105263158e-06, "loss": 0.3761, "step": 4280 }, { "epoch": 1.7639802631578947, "grad_norm": 23.77560806274414, "learning_rate": 8.239309210526316e-06, "loss": 0.632, "step": 4290 }, { "epoch": 1.768092105263158, "grad_norm": 10.43442440032959, "learning_rate": 8.235197368421053e-06, "loss": 0.2882, "step": 4300 }, { "epoch": 1.772203947368421, "grad_norm": 42.35690689086914, "learning_rate": 8.23108552631579e-06, "loss": 0.5975, "step": 4310 }, { "epoch": 1.776315789473684, "grad_norm": 1.9145148992538452, "learning_rate": 8.226973684210527e-06, "loss": 0.1458, "step": 4320 }, { "epoch": 1.7804276315789473, "grad_norm": 0.08790173381567001, "learning_rate": 8.222861842105264e-06, "loss": 0.5101, "step": 4330 }, { "epoch": 1.7845394736842106, "grad_norm": 31.585073471069336, "learning_rate": 8.218750000000002e-06, "loss": 0.7057, "step": 4340 }, { "epoch": 1.7886513157894737, "grad_norm": 1.391623616218567, "learning_rate": 8.214638157894737e-06, "loss": 0.1948, "step": 4350 }, { "epoch": 1.7927631578947367, "grad_norm": 50.8994026184082, "learning_rate": 8.210526315789475e-06, "loss": 0.2784, "step": 4360 }, { "epoch": 1.796875, "grad_norm": 61.71848678588867, "learning_rate": 8.20641447368421e-06, "loss": 0.6412, "step": 4370 }, { "epoch": 1.8009868421052633, "grad_norm": 3.166621208190918, "learning_rate": 8.202302631578948e-06, "loss": 0.484, "step": 4380 }, { "epoch": 1.8050986842105263, "grad_norm": 60.430580139160156, "learning_rate": 8.198190789473684e-06, "loss": 0.5307, "step": 4390 }, { "epoch": 1.8092105263157894, "grad_norm": 0.003818509867414832, "learning_rate": 8.194078947368421e-06, "loss": 0.0689, "step": 4400 }, { "epoch": 1.8133223684210527, "grad_norm": 28.11445426940918, "learning_rate": 8.189967105263159e-06, "loss": 0.5673, "step": 4410 }, { "epoch": 1.817434210526316, "grad_norm": 1.9147342443466187, "learning_rate": 8.185855263157896e-06, "loss": 0.3738, "step": 4420 }, { "epoch": 1.821546052631579, "grad_norm": 75.0318832397461, "learning_rate": 8.181743421052632e-06, "loss": 0.6623, "step": 4430 }, { "epoch": 1.825657894736842, "grad_norm": 0.37105000019073486, "learning_rate": 8.17763157894737e-06, "loss": 0.4921, "step": 4440 }, { "epoch": 1.8297697368421053, "grad_norm": 48.0621452331543, "learning_rate": 8.173519736842107e-06, "loss": 0.1337, "step": 4450 }, { "epoch": 1.8338815789473686, "grad_norm": 80.51526641845703, "learning_rate": 8.169407894736843e-06, "loss": 0.3979, "step": 4460 }, { "epoch": 1.8379934210526314, "grad_norm": 43.526065826416016, "learning_rate": 8.16529605263158e-06, "loss": 0.4044, "step": 4470 }, { "epoch": 1.8421052631578947, "grad_norm": 47.46805953979492, "learning_rate": 8.161184210526316e-06, "loss": 0.215, "step": 4480 }, { "epoch": 1.846217105263158, "grad_norm": 48.90501022338867, "learning_rate": 8.157072368421053e-06, "loss": 0.6912, "step": 4490 }, { "epoch": 1.850328947368421, "grad_norm": 9.779630661010742, "learning_rate": 8.152960526315789e-06, "loss": 0.4418, "step": 4500 }, { "epoch": 1.854440789473684, "grad_norm": 0.7416747212409973, "learning_rate": 8.148848684210528e-06, "loss": 0.678, "step": 4510 }, { "epoch": 1.8585526315789473, "grad_norm": 0.008086485788226128, "learning_rate": 8.144736842105264e-06, "loss": 0.6166, "step": 4520 }, { "epoch": 1.8626644736842106, "grad_norm": 4.0084967613220215, "learning_rate": 8.140625000000001e-06, "loss": 0.1721, "step": 4530 }, { "epoch": 1.8667763157894737, "grad_norm": 0.25842565298080444, "learning_rate": 8.136513157894737e-06, "loss": 0.0703, "step": 4540 }, { "epoch": 1.8708881578947367, "grad_norm": 27.43667221069336, "learning_rate": 8.132401315789475e-06, "loss": 0.3083, "step": 4550 }, { "epoch": 1.875, "grad_norm": 48.5318489074707, "learning_rate": 8.12828947368421e-06, "loss": 0.2443, "step": 4560 }, { "epoch": 1.8791118421052633, "grad_norm": 0.08080202341079712, "learning_rate": 8.124177631578948e-06, "loss": 0.6487, "step": 4570 }, { "epoch": 1.8832236842105263, "grad_norm": 0.3895185589790344, "learning_rate": 8.120065789473685e-06, "loss": 0.1034, "step": 4580 }, { "epoch": 1.8873355263157894, "grad_norm": 1.0280920267105103, "learning_rate": 8.115953947368421e-06, "loss": 0.307, "step": 4590 }, { "epoch": 1.8914473684210527, "grad_norm": 65.11549377441406, "learning_rate": 8.111842105263158e-06, "loss": 0.4791, "step": 4600 }, { "epoch": 1.895559210526316, "grad_norm": 74.55525970458984, "learning_rate": 8.107730263157896e-06, "loss": 0.6433, "step": 4610 }, { "epoch": 1.899671052631579, "grad_norm": 22.78346824645996, "learning_rate": 8.103618421052632e-06, "loss": 0.786, "step": 4620 }, { "epoch": 1.903782894736842, "grad_norm": 86.56382751464844, "learning_rate": 8.099506578947369e-06, "loss": 0.6106, "step": 4630 }, { "epoch": 1.9078947368421053, "grad_norm": 0.2657437324523926, "learning_rate": 8.095394736842107e-06, "loss": 0.6693, "step": 4640 }, { "epoch": 1.9120065789473686, "grad_norm": 0.009138388559222221, "learning_rate": 8.091282894736842e-06, "loss": 0.2074, "step": 4650 }, { "epoch": 1.9161184210526314, "grad_norm": 12.050437927246094, "learning_rate": 8.08717105263158e-06, "loss": 0.3525, "step": 4660 }, { "epoch": 1.9202302631578947, "grad_norm": 85.17498016357422, "learning_rate": 8.083059210526316e-06, "loss": 0.1985, "step": 4670 }, { "epoch": 1.924342105263158, "grad_norm": 0.09853484481573105, "learning_rate": 8.078947368421053e-06, "loss": 0.0842, "step": 4680 }, { "epoch": 1.928453947368421, "grad_norm": 58.58940124511719, "learning_rate": 8.074835526315789e-06, "loss": 0.556, "step": 4690 }, { "epoch": 1.932565789473684, "grad_norm": 0.3376722037792206, "learning_rate": 8.070723684210528e-06, "loss": 0.3301, "step": 4700 }, { "epoch": 1.9366776315789473, "grad_norm": 61.81422805786133, "learning_rate": 8.066611842105264e-06, "loss": 0.5592, "step": 4710 }, { "epoch": 1.9407894736842106, "grad_norm": 14.672579765319824, "learning_rate": 8.062500000000001e-06, "loss": 0.2208, "step": 4720 }, { "epoch": 1.9449013157894737, "grad_norm": 11.613296508789062, "learning_rate": 8.058388157894737e-06, "loss": 0.3177, "step": 4730 }, { "epoch": 1.9490131578947367, "grad_norm": 4.859875202178955, "learning_rate": 8.054276315789474e-06, "loss": 0.1121, "step": 4740 }, { "epoch": 1.953125, "grad_norm": 1.6204088926315308, "learning_rate": 8.050164473684212e-06, "loss": 0.3163, "step": 4750 }, { "epoch": 1.9572368421052633, "grad_norm": 22.201229095458984, "learning_rate": 8.046052631578948e-06, "loss": 0.4466, "step": 4760 }, { "epoch": 1.9613486842105263, "grad_norm": 28.351089477539062, "learning_rate": 8.041940789473685e-06, "loss": 0.1054, "step": 4770 }, { "epoch": 1.9654605263157894, "grad_norm": 47.56678771972656, "learning_rate": 8.03782894736842e-06, "loss": 0.2095, "step": 4780 }, { "epoch": 1.9695723684210527, "grad_norm": 3.671384572982788, "learning_rate": 8.033717105263158e-06, "loss": 0.638, "step": 4790 }, { "epoch": 1.973684210526316, "grad_norm": 0.8618096709251404, "learning_rate": 8.029605263157896e-06, "loss": 0.4759, "step": 4800 }, { "epoch": 1.977796052631579, "grad_norm": 0.04880933091044426, "learning_rate": 8.025493421052633e-06, "loss": 0.6423, "step": 4810 }, { "epoch": 1.981907894736842, "grad_norm": 0.03361703082919121, "learning_rate": 8.021381578947369e-06, "loss": 0.8028, "step": 4820 }, { "epoch": 1.9860197368421053, "grad_norm": 0.3052118718624115, "learning_rate": 8.017269736842106e-06, "loss": 0.0253, "step": 4830 }, { "epoch": 1.9901315789473686, "grad_norm": 0.035156674683094025, "learning_rate": 8.013157894736842e-06, "loss": 0.1967, "step": 4840 }, { "epoch": 1.9942434210526314, "grad_norm": 81.99966430664062, "learning_rate": 8.00904605263158e-06, "loss": 0.5797, "step": 4850 }, { "epoch": 1.9983552631578947, "grad_norm": 0.685602068901062, "learning_rate": 8.004934210526315e-06, "loss": 0.309, "step": 4860 }, { "epoch": 2.002467105263158, "grad_norm": 37.46296310424805, "learning_rate": 8.000822368421053e-06, "loss": 0.2657, "step": 4870 }, { "epoch": 2.0065789473684212, "grad_norm": 0.056410618126392365, "learning_rate": 7.99671052631579e-06, "loss": 0.3954, "step": 4880 }, { "epoch": 2.010690789473684, "grad_norm": 112.95884704589844, "learning_rate": 7.992598684210528e-06, "loss": 0.3842, "step": 4890 }, { "epoch": 2.0148026315789473, "grad_norm": 0.022623682394623756, "learning_rate": 7.988486842105263e-06, "loss": 0.5508, "step": 4900 }, { "epoch": 2.0189144736842106, "grad_norm": 58.53015899658203, "learning_rate": 7.984375000000001e-06, "loss": 0.2577, "step": 4910 }, { "epoch": 2.023026315789474, "grad_norm": 7.032954216003418, "learning_rate": 7.980263157894738e-06, "loss": 0.3766, "step": 4920 }, { "epoch": 2.0271381578947367, "grad_norm": 0.21705050766468048, "learning_rate": 7.976151315789474e-06, "loss": 0.1742, "step": 4930 }, { "epoch": 2.03125, "grad_norm": 0.012392381206154823, "learning_rate": 7.972039473684212e-06, "loss": 0.144, "step": 4940 }, { "epoch": 2.0353618421052633, "grad_norm": 46.255126953125, "learning_rate": 7.967927631578947e-06, "loss": 0.036, "step": 4950 }, { "epoch": 2.039473684210526, "grad_norm": 0.034974802285432816, "learning_rate": 7.963815789473685e-06, "loss": 0.0679, "step": 4960 }, { "epoch": 2.0435855263157894, "grad_norm": 0.04458899050951004, "learning_rate": 7.95970394736842e-06, "loss": 0.0958, "step": 4970 }, { "epoch": 2.0476973684210527, "grad_norm": 47.49546813964844, "learning_rate": 7.95559210526316e-06, "loss": 0.5122, "step": 4980 }, { "epoch": 2.051809210526316, "grad_norm": 33.729793548583984, "learning_rate": 7.951480263157896e-06, "loss": 0.6454, "step": 4990 }, { "epoch": 2.0559210526315788, "grad_norm": 12.397784233093262, "learning_rate": 7.947368421052633e-06, "loss": 0.0289, "step": 5000 }, { "epoch": 2.060032894736842, "grad_norm": 45.05794906616211, "learning_rate": 7.943256578947369e-06, "loss": 0.1298, "step": 5010 }, { "epoch": 2.0641447368421053, "grad_norm": 57.769283294677734, "learning_rate": 7.939144736842106e-06, "loss": 0.4246, "step": 5020 }, { "epoch": 2.0682565789473686, "grad_norm": 28.973388671875, "learning_rate": 7.935032894736842e-06, "loss": 0.2389, "step": 5030 }, { "epoch": 2.0723684210526314, "grad_norm": 56.98832321166992, "learning_rate": 7.93092105263158e-06, "loss": 0.1893, "step": 5040 }, { "epoch": 2.0764802631578947, "grad_norm": 56.96243667602539, "learning_rate": 7.926809210526317e-06, "loss": 0.2976, "step": 5050 }, { "epoch": 2.080592105263158, "grad_norm": 55.864070892333984, "learning_rate": 7.922697368421053e-06, "loss": 0.4408, "step": 5060 }, { "epoch": 2.0847039473684212, "grad_norm": 1.9658788442611694, "learning_rate": 7.91858552631579e-06, "loss": 0.1167, "step": 5070 }, { "epoch": 2.088815789473684, "grad_norm": 1.0686661005020142, "learning_rate": 7.914473684210528e-06, "loss": 0.1516, "step": 5080 }, { "epoch": 2.0929276315789473, "grad_norm": 2.143155813217163, "learning_rate": 7.910361842105265e-06, "loss": 0.1635, "step": 5090 }, { "epoch": 2.0970394736842106, "grad_norm": 52.307498931884766, "learning_rate": 7.90625e-06, "loss": 0.2435, "step": 5100 }, { "epoch": 2.101151315789474, "grad_norm": 2.929649829864502, "learning_rate": 7.902138157894738e-06, "loss": 0.327, "step": 5110 }, { "epoch": 2.1052631578947367, "grad_norm": 3.452099561691284, "learning_rate": 7.898026315789474e-06, "loss": 0.2144, "step": 5120 }, { "epoch": 2.109375, "grad_norm": 18.331693649291992, "learning_rate": 7.893914473684211e-06, "loss": 0.1962, "step": 5130 }, { "epoch": 2.1134868421052633, "grad_norm": 6.536662578582764, "learning_rate": 7.889802631578947e-06, "loss": 0.0614, "step": 5140 }, { "epoch": 2.117598684210526, "grad_norm": 0.015528288669884205, "learning_rate": 7.885690789473685e-06, "loss": 0.218, "step": 5150 }, { "epoch": 2.1217105263157894, "grad_norm": 0.06858813762664795, "learning_rate": 7.88157894736842e-06, "loss": 0.3551, "step": 5160 }, { "epoch": 2.1258223684210527, "grad_norm": 2.6634881496429443, "learning_rate": 7.87746710526316e-06, "loss": 0.2286, "step": 5170 }, { "epoch": 2.129934210526316, "grad_norm": 0.09859636425971985, "learning_rate": 7.873355263157895e-06, "loss": 0.6348, "step": 5180 }, { "epoch": 2.1340460526315788, "grad_norm": 0.701995313167572, "learning_rate": 7.869243421052633e-06, "loss": 0.1765, "step": 5190 }, { "epoch": 2.138157894736842, "grad_norm": 0.037273943424224854, "learning_rate": 7.865131578947369e-06, "loss": 0.1263, "step": 5200 }, { "epoch": 2.1422697368421053, "grad_norm": 0.040880296379327774, "learning_rate": 7.861019736842106e-06, "loss": 0.1073, "step": 5210 }, { "epoch": 2.1463815789473686, "grad_norm": 77.20631408691406, "learning_rate": 7.856907894736843e-06, "loss": 0.4358, "step": 5220 }, { "epoch": 2.1504934210526314, "grad_norm": 3.0592525005340576, "learning_rate": 7.85279605263158e-06, "loss": 0.3378, "step": 5230 }, { "epoch": 2.1546052631578947, "grad_norm": 2.437439441680908, "learning_rate": 7.848684210526317e-06, "loss": 0.3702, "step": 5240 }, { "epoch": 2.158717105263158, "grad_norm": 0.14886601269245148, "learning_rate": 7.844572368421052e-06, "loss": 0.1371, "step": 5250 }, { "epoch": 2.1628289473684212, "grad_norm": 45.28163146972656, "learning_rate": 7.84046052631579e-06, "loss": 0.3802, "step": 5260 }, { "epoch": 2.166940789473684, "grad_norm": 72.54158782958984, "learning_rate": 7.836348684210527e-06, "loss": 0.4916, "step": 5270 }, { "epoch": 2.1710526315789473, "grad_norm": 2.2983322143554688, "learning_rate": 7.832236842105265e-06, "loss": 0.1063, "step": 5280 }, { "epoch": 2.1751644736842106, "grad_norm": 41.453407287597656, "learning_rate": 7.828125e-06, "loss": 0.8019, "step": 5290 }, { "epoch": 2.1792763157894735, "grad_norm": 20.001136779785156, "learning_rate": 7.824013157894738e-06, "loss": 0.0648, "step": 5300 }, { "epoch": 2.1833881578947367, "grad_norm": 0.0267824474722147, "learning_rate": 7.819901315789474e-06, "loss": 0.2172, "step": 5310 }, { "epoch": 2.1875, "grad_norm": 79.87635803222656, "learning_rate": 7.815789473684211e-06, "loss": 0.9097, "step": 5320 }, { "epoch": 2.1916118421052633, "grad_norm": 0.06062754988670349, "learning_rate": 7.811677631578947e-06, "loss": 0.4808, "step": 5330 }, { "epoch": 2.1957236842105265, "grad_norm": 0.18611417710781097, "learning_rate": 7.807565789473684e-06, "loss": 0.3803, "step": 5340 }, { "epoch": 2.1998355263157894, "grad_norm": 0.8481597900390625, "learning_rate": 7.803453947368422e-06, "loss": 0.3331, "step": 5350 }, { "epoch": 2.2039473684210527, "grad_norm": 14.719058990478516, "learning_rate": 7.79934210526316e-06, "loss": 0.4904, "step": 5360 }, { "epoch": 2.208059210526316, "grad_norm": 12.377104759216309, "learning_rate": 7.795230263157895e-06, "loss": 0.1892, "step": 5370 }, { "epoch": 2.2121710526315788, "grad_norm": 16.943613052368164, "learning_rate": 7.791118421052633e-06, "loss": 0.2359, "step": 5380 }, { "epoch": 2.216282894736842, "grad_norm": 77.85987091064453, "learning_rate": 7.78700657894737e-06, "loss": 0.772, "step": 5390 }, { "epoch": 2.2203947368421053, "grad_norm": 66.15851593017578, "learning_rate": 7.782894736842106e-06, "loss": 0.1909, "step": 5400 }, { "epoch": 2.2245065789473686, "grad_norm": 67.50369262695312, "learning_rate": 7.778782894736843e-06, "loss": 0.192, "step": 5410 }, { "epoch": 2.2286184210526314, "grad_norm": 0.5703405141830444, "learning_rate": 7.774671052631579e-06, "loss": 0.0897, "step": 5420 }, { "epoch": 2.2327302631578947, "grad_norm": 52.4270133972168, "learning_rate": 7.770559210526316e-06, "loss": 0.0543, "step": 5430 }, { "epoch": 2.236842105263158, "grad_norm": 77.67536163330078, "learning_rate": 7.766447368421052e-06, "loss": 0.3614, "step": 5440 }, { "epoch": 2.2409539473684212, "grad_norm": 61.79861831665039, "learning_rate": 7.76233552631579e-06, "loss": 0.0899, "step": 5450 }, { "epoch": 2.245065789473684, "grad_norm": 0.18140049278736115, "learning_rate": 7.758223684210527e-06, "loss": 0.411, "step": 5460 }, { "epoch": 2.2491776315789473, "grad_norm": 37.416351318359375, "learning_rate": 7.754111842105265e-06, "loss": 0.1519, "step": 5470 }, { "epoch": 2.2532894736842106, "grad_norm": 0.24811792373657227, "learning_rate": 7.75e-06, "loss": 0.049, "step": 5480 }, { "epoch": 2.2574013157894735, "grad_norm": 0.09098932147026062, "learning_rate": 7.745888157894738e-06, "loss": 0.1216, "step": 5490 }, { "epoch": 2.2615131578947367, "grad_norm": 0.39087772369384766, "learning_rate": 7.741776315789474e-06, "loss": 0.284, "step": 5500 }, { "epoch": 2.265625, "grad_norm": 80.84004974365234, "learning_rate": 7.737664473684211e-06, "loss": 0.2081, "step": 5510 }, { "epoch": 2.2697368421052633, "grad_norm": 0.9717281460762024, "learning_rate": 7.733552631578948e-06, "loss": 0.3674, "step": 5520 }, { "epoch": 2.2738486842105265, "grad_norm": 0.6159382462501526, "learning_rate": 7.729440789473684e-06, "loss": 0.1791, "step": 5530 }, { "epoch": 2.2779605263157894, "grad_norm": 0.013826197013258934, "learning_rate": 7.725328947368422e-06, "loss": 0.0343, "step": 5540 }, { "epoch": 2.2820723684210527, "grad_norm": 78.19012451171875, "learning_rate": 7.721217105263159e-06, "loss": 0.5983, "step": 5550 }, { "epoch": 2.286184210526316, "grad_norm": 0.009286273270845413, "learning_rate": 7.717105263157897e-06, "loss": 0.1152, "step": 5560 }, { "epoch": 2.2902960526315788, "grad_norm": 0.7038535475730896, "learning_rate": 7.712993421052632e-06, "loss": 0.0342, "step": 5570 }, { "epoch": 2.294407894736842, "grad_norm": 16.376056671142578, "learning_rate": 7.70888157894737e-06, "loss": 0.2536, "step": 5580 }, { "epoch": 2.2985197368421053, "grad_norm": 75.77314758300781, "learning_rate": 7.704769736842106e-06, "loss": 0.967, "step": 5590 }, { "epoch": 2.3026315789473686, "grad_norm": 0.04217597842216492, "learning_rate": 7.700657894736843e-06, "loss": 0.0899, "step": 5600 }, { "epoch": 2.3067434210526314, "grad_norm": 0.0018036451656371355, "learning_rate": 7.696546052631579e-06, "loss": 0.0971, "step": 5610 }, { "epoch": 2.3108552631578947, "grad_norm": 64.49011993408203, "learning_rate": 7.692434210526316e-06, "loss": 0.5319, "step": 5620 }, { "epoch": 2.314967105263158, "grad_norm": 6.954440593719482, "learning_rate": 7.688322368421052e-06, "loss": 0.4004, "step": 5630 }, { "epoch": 2.3190789473684212, "grad_norm": 83.60736083984375, "learning_rate": 7.68421052631579e-06, "loss": 0.2637, "step": 5640 }, { "epoch": 2.323190789473684, "grad_norm": 0.47764426469802856, "learning_rate": 7.680098684210527e-06, "loss": 0.2358, "step": 5650 }, { "epoch": 2.3273026315789473, "grad_norm": 0.11332187056541443, "learning_rate": 7.675986842105264e-06, "loss": 0.1717, "step": 5660 }, { "epoch": 2.3314144736842106, "grad_norm": 21.913246154785156, "learning_rate": 7.671875e-06, "loss": 0.3004, "step": 5670 }, { "epoch": 2.3355263157894735, "grad_norm": 0.553477942943573, "learning_rate": 7.667763157894738e-06, "loss": 0.1921, "step": 5680 }, { "epoch": 2.3396381578947367, "grad_norm": 94.19501495361328, "learning_rate": 7.663651315789475e-06, "loss": 0.2609, "step": 5690 }, { "epoch": 2.34375, "grad_norm": 21.826440811157227, "learning_rate": 7.65953947368421e-06, "loss": 0.0104, "step": 5700 }, { "epoch": 2.3478618421052633, "grad_norm": 4.665867328643799, "learning_rate": 7.655427631578948e-06, "loss": 0.3106, "step": 5710 }, { "epoch": 2.3519736842105265, "grad_norm": 3.169865846633911, "learning_rate": 7.651315789473684e-06, "loss": 0.2123, "step": 5720 }, { "epoch": 2.3560855263157894, "grad_norm": 64.12752532958984, "learning_rate": 7.647203947368421e-06, "loss": 0.101, "step": 5730 }, { "epoch": 2.3601973684210527, "grad_norm": 0.16324739158153534, "learning_rate": 7.643092105263159e-06, "loss": 0.2564, "step": 5740 }, { "epoch": 2.364309210526316, "grad_norm": 0.007978091016411781, "learning_rate": 7.638980263157896e-06, "loss": 0.0914, "step": 5750 }, { "epoch": 2.3684210526315788, "grad_norm": 6.777188301086426, "learning_rate": 7.634868421052632e-06, "loss": 0.123, "step": 5760 }, { "epoch": 2.372532894736842, "grad_norm": 21.03947639465332, "learning_rate": 7.63075657894737e-06, "loss": 0.0371, "step": 5770 }, { "epoch": 2.3766447368421053, "grad_norm": 0.5910683274269104, "learning_rate": 7.626644736842106e-06, "loss": 0.0277, "step": 5780 }, { "epoch": 2.3807565789473686, "grad_norm": 5.500244140625, "learning_rate": 7.622532894736843e-06, "loss": 0.3893, "step": 5790 }, { "epoch": 2.3848684210526314, "grad_norm": 60.288055419921875, "learning_rate": 7.6184210526315794e-06, "loss": 0.668, "step": 5800 }, { "epoch": 2.3889802631578947, "grad_norm": 0.009432058781385422, "learning_rate": 7.614309210526316e-06, "loss": 0.0354, "step": 5810 }, { "epoch": 2.393092105263158, "grad_norm": 0.014949236996471882, "learning_rate": 7.610197368421053e-06, "loss": 0.1037, "step": 5820 }, { "epoch": 2.3972039473684212, "grad_norm": 2.148726224899292, "learning_rate": 7.606085526315789e-06, "loss": 0.5846, "step": 5830 }, { "epoch": 2.401315789473684, "grad_norm": 2.4273200035095215, "learning_rate": 7.6019736842105276e-06, "loss": 0.0866, "step": 5840 }, { "epoch": 2.4054276315789473, "grad_norm": 70.4505844116211, "learning_rate": 7.597861842105264e-06, "loss": 0.3915, "step": 5850 }, { "epoch": 2.4095394736842106, "grad_norm": 60.22188186645508, "learning_rate": 7.593750000000001e-06, "loss": 0.5337, "step": 5860 }, { "epoch": 2.4136513157894735, "grad_norm": 8.238316535949707, "learning_rate": 7.589638157894737e-06, "loss": 0.0111, "step": 5870 }, { "epoch": 2.4177631578947367, "grad_norm": 14.266987800598145, "learning_rate": 7.585526315789474e-06, "loss": 0.1612, "step": 5880 }, { "epoch": 2.421875, "grad_norm": 0.43882203102111816, "learning_rate": 7.581414473684211e-06, "loss": 0.3086, "step": 5890 }, { "epoch": 2.4259868421052633, "grad_norm": 0.2558574974536896, "learning_rate": 7.577302631578948e-06, "loss": 0.3342, "step": 5900 }, { "epoch": 2.4300986842105265, "grad_norm": 0.09452182054519653, "learning_rate": 7.573190789473685e-06, "loss": 0.2291, "step": 5910 }, { "epoch": 2.4342105263157894, "grad_norm": 0.0014281157637014985, "learning_rate": 7.569078947368421e-06, "loss": 0.0711, "step": 5920 }, { "epoch": 2.4383223684210527, "grad_norm": 37.98161697387695, "learning_rate": 7.564967105263159e-06, "loss": 0.3051, "step": 5930 }, { "epoch": 2.442434210526316, "grad_norm": 3.323979139328003, "learning_rate": 7.560855263157895e-06, "loss": 0.0245, "step": 5940 }, { "epoch": 2.4465460526315788, "grad_norm": 1.3332736492156982, "learning_rate": 7.556743421052633e-06, "loss": 0.3637, "step": 5950 }, { "epoch": 2.450657894736842, "grad_norm": 2.7701385021209717, "learning_rate": 7.552631578947369e-06, "loss": 0.2454, "step": 5960 }, { "epoch": 2.4547697368421053, "grad_norm": 66.12533569335938, "learning_rate": 7.548519736842106e-06, "loss": 0.4883, "step": 5970 }, { "epoch": 2.4588815789473686, "grad_norm": 0.015920128673315048, "learning_rate": 7.544407894736843e-06, "loss": 0.2759, "step": 5980 }, { "epoch": 2.4629934210526314, "grad_norm": 0.8414337635040283, "learning_rate": 7.540296052631579e-06, "loss": 0.7129, "step": 5990 }, { "epoch": 2.4671052631578947, "grad_norm": 14.422956466674805, "learning_rate": 7.536184210526316e-06, "loss": 0.3284, "step": 6000 }, { "epoch": 2.471217105263158, "grad_norm": 0.07774366438388824, "learning_rate": 7.5320723684210524e-06, "loss": 0.4476, "step": 6010 }, { "epoch": 2.4753289473684212, "grad_norm": 0.7951784133911133, "learning_rate": 7.52796052631579e-06, "loss": 0.3266, "step": 6020 }, { "epoch": 2.479440789473684, "grad_norm": 17.519073486328125, "learning_rate": 7.523848684210527e-06, "loss": 0.2493, "step": 6030 }, { "epoch": 2.4835526315789473, "grad_norm": 10.533546447753906, "learning_rate": 7.519736842105264e-06, "loss": 0.2244, "step": 6040 }, { "epoch": 2.4876644736842106, "grad_norm": 0.004280813504010439, "learning_rate": 7.5156250000000006e-06, "loss": 0.2085, "step": 6050 }, { "epoch": 2.4917763157894735, "grad_norm": 5.5901970863342285, "learning_rate": 7.511513157894737e-06, "loss": 0.0381, "step": 6060 }, { "epoch": 2.4958881578947367, "grad_norm": 72.65666961669922, "learning_rate": 7.507401315789475e-06, "loss": 0.1484, "step": 6070 }, { "epoch": 2.5, "grad_norm": 0.09732012450695038, "learning_rate": 7.503289473684211e-06, "loss": 0.1253, "step": 6080 }, { "epoch": 2.5041118421052633, "grad_norm": 1.242153525352478, "learning_rate": 7.499177631578948e-06, "loss": 0.4389, "step": 6090 }, { "epoch": 2.5082236842105265, "grad_norm": 0.5862547159194946, "learning_rate": 7.4950657894736845e-06, "loss": 0.0854, "step": 6100 }, { "epoch": 2.5123355263157894, "grad_norm": 10.629526138305664, "learning_rate": 7.490953947368421e-06, "loss": 0.0749, "step": 6110 }, { "epoch": 2.5164473684210527, "grad_norm": 0.005628309678286314, "learning_rate": 7.486842105263159e-06, "loss": 0.2196, "step": 6120 }, { "epoch": 2.520559210526316, "grad_norm": 0.13058516383171082, "learning_rate": 7.482730263157896e-06, "loss": 0.1543, "step": 6130 }, { "epoch": 2.5246710526315788, "grad_norm": 0.01582922600209713, "learning_rate": 7.478618421052633e-06, "loss": 0.1814, "step": 6140 }, { "epoch": 2.528782894736842, "grad_norm": 0.9425368309020996, "learning_rate": 7.474506578947369e-06, "loss": 0.4506, "step": 6150 }, { "epoch": 2.5328947368421053, "grad_norm": 0.3503407835960388, "learning_rate": 7.470394736842106e-06, "loss": 0.2199, "step": 6160 }, { "epoch": 2.5370065789473686, "grad_norm": 27.474987030029297, "learning_rate": 7.466282894736842e-06, "loss": 0.2349, "step": 6170 }, { "epoch": 2.5411184210526314, "grad_norm": 91.42644500732422, "learning_rate": 7.462171052631579e-06, "loss": 0.8179, "step": 6180 }, { "epoch": 2.5452302631578947, "grad_norm": 66.36519622802734, "learning_rate": 7.4580592105263165e-06, "loss": 0.547, "step": 6190 }, { "epoch": 2.549342105263158, "grad_norm": 92.01309204101562, "learning_rate": 7.453947368421053e-06, "loss": 0.2581, "step": 6200 }, { "epoch": 2.5534539473684212, "grad_norm": 90.81409454345703, "learning_rate": 7.44983552631579e-06, "loss": 0.7715, "step": 6210 }, { "epoch": 2.557565789473684, "grad_norm": 70.20539855957031, "learning_rate": 7.445723684210527e-06, "loss": 0.1391, "step": 6220 }, { "epoch": 2.5616776315789473, "grad_norm": 7.48537015914917, "learning_rate": 7.441611842105264e-06, "loss": 0.7486, "step": 6230 }, { "epoch": 2.5657894736842106, "grad_norm": 0.0066050267778337, "learning_rate": 7.437500000000001e-06, "loss": 0.6291, "step": 6240 }, { "epoch": 2.5699013157894735, "grad_norm": 0.050669994205236435, "learning_rate": 7.433388157894738e-06, "loss": 0.4743, "step": 6250 }, { "epoch": 2.5740131578947367, "grad_norm": 0.9918844699859619, "learning_rate": 7.4292763157894744e-06, "loss": 0.4746, "step": 6260 }, { "epoch": 2.578125, "grad_norm": 69.81021118164062, "learning_rate": 7.425164473684211e-06, "loss": 0.283, "step": 6270 }, { "epoch": 2.5822368421052633, "grad_norm": 0.3285808265209198, "learning_rate": 7.421052631578948e-06, "loss": 0.575, "step": 6280 }, { "epoch": 2.5863486842105265, "grad_norm": 0.7911439538002014, "learning_rate": 7.416940789473684e-06, "loss": 0.2961, "step": 6290 }, { "epoch": 2.5904605263157894, "grad_norm": 8.023531913757324, "learning_rate": 7.412828947368421e-06, "loss": 0.1326, "step": 6300 }, { "epoch": 2.5945723684210527, "grad_norm": 0.36695143580436707, "learning_rate": 7.408717105263159e-06, "loss": 0.1047, "step": 6310 }, { "epoch": 2.598684210526316, "grad_norm": 4.745920658111572, "learning_rate": 7.404605263157896e-06, "loss": 0.3415, "step": 6320 }, { "epoch": 2.6027960526315788, "grad_norm": 14.169811248779297, "learning_rate": 7.400493421052632e-06, "loss": 0.481, "step": 6330 }, { "epoch": 2.606907894736842, "grad_norm": 37.04063034057617, "learning_rate": 7.396381578947369e-06, "loss": 0.2138, "step": 6340 }, { "epoch": 2.6110197368421053, "grad_norm": 72.94132995605469, "learning_rate": 7.392269736842106e-06, "loss": 0.5681, "step": 6350 }, { "epoch": 2.6151315789473686, "grad_norm": 0.2758810520172119, "learning_rate": 7.388157894736843e-06, "loss": 0.5019, "step": 6360 }, { "epoch": 2.6192434210526314, "grad_norm": 22.47516632080078, "learning_rate": 7.38404605263158e-06, "loss": 0.1939, "step": 6370 }, { "epoch": 2.6233552631578947, "grad_norm": 0.08755365759134293, "learning_rate": 7.379934210526316e-06, "loss": 0.2396, "step": 6380 }, { "epoch": 2.627467105263158, "grad_norm": 1.0105468034744263, "learning_rate": 7.375822368421053e-06, "loss": 0.0383, "step": 6390 }, { "epoch": 2.6315789473684212, "grad_norm": 76.56537628173828, "learning_rate": 7.3717105263157895e-06, "loss": 0.2995, "step": 6400 }, { "epoch": 2.635690789473684, "grad_norm": 3.7688522338867188, "learning_rate": 7.367598684210528e-06, "loss": 0.1928, "step": 6410 }, { "epoch": 2.6398026315789473, "grad_norm": 4.840407848358154, "learning_rate": 7.363486842105264e-06, "loss": 0.5117, "step": 6420 }, { "epoch": 2.6439144736842106, "grad_norm": 0.04940143600106239, "learning_rate": 7.359375000000001e-06, "loss": 0.0273, "step": 6430 }, { "epoch": 2.6480263157894735, "grad_norm": 1.0362895727157593, "learning_rate": 7.355263157894738e-06, "loss": 0.7687, "step": 6440 }, { "epoch": 2.6521381578947367, "grad_norm": 0.010492905043065548, "learning_rate": 7.351151315789474e-06, "loss": 0.2144, "step": 6450 }, { "epoch": 2.65625, "grad_norm": 73.12825012207031, "learning_rate": 7.347039473684211e-06, "loss": 0.2823, "step": 6460 }, { "epoch": 2.6603618421052633, "grad_norm": 0.42096540331840515, "learning_rate": 7.3429276315789474e-06, "loss": 0.3534, "step": 6470 }, { "epoch": 2.6644736842105265, "grad_norm": 0.14485445618629456, "learning_rate": 7.338815789473684e-06, "loss": 0.3453, "step": 6480 }, { "epoch": 2.6685855263157894, "grad_norm": 12.482208251953125, "learning_rate": 7.3347039473684215e-06, "loss": 0.0713, "step": 6490 }, { "epoch": 2.6726973684210527, "grad_norm": 0.2503849267959595, "learning_rate": 7.330592105263159e-06, "loss": 0.2003, "step": 6500 }, { "epoch": 2.676809210526316, "grad_norm": 80.90233612060547, "learning_rate": 7.3264802631578956e-06, "loss": 0.6061, "step": 6510 }, { "epoch": 2.6809210526315788, "grad_norm": 0.1269141584634781, "learning_rate": 7.322368421052632e-06, "loss": 0.1411, "step": 6520 }, { "epoch": 2.685032894736842, "grad_norm": 0.40265151858329773, "learning_rate": 7.318256578947369e-06, "loss": 0.3083, "step": 6530 }, { "epoch": 2.6891447368421053, "grad_norm": 0.008912616409361362, "learning_rate": 7.314144736842106e-06, "loss": 0.2767, "step": 6540 }, { "epoch": 2.6932565789473686, "grad_norm": 100.50891876220703, "learning_rate": 7.310032894736843e-06, "loss": 0.5464, "step": 6550 }, { "epoch": 2.6973684210526314, "grad_norm": 1.5742254257202148, "learning_rate": 7.3059210526315795e-06, "loss": 0.2836, "step": 6560 }, { "epoch": 2.7014802631578947, "grad_norm": 40.927833557128906, "learning_rate": 7.301809210526316e-06, "loss": 0.4598, "step": 6570 }, { "epoch": 2.705592105263158, "grad_norm": 31.99708366394043, "learning_rate": 7.297697368421053e-06, "loss": 0.701, "step": 6580 }, { "epoch": 2.7097039473684212, "grad_norm": 154.178955078125, "learning_rate": 7.293585526315789e-06, "loss": 0.3698, "step": 6590 }, { "epoch": 2.713815789473684, "grad_norm": 0.013821637257933617, "learning_rate": 7.289473684210528e-06, "loss": 0.0203, "step": 6600 }, { "epoch": 2.7179276315789473, "grad_norm": 7.3465352058410645, "learning_rate": 7.285361842105264e-06, "loss": 0.1565, "step": 6610 }, { "epoch": 2.7220394736842106, "grad_norm": 22.534740447998047, "learning_rate": 7.281250000000001e-06, "loss": 0.0484, "step": 6620 }, { "epoch": 2.7261513157894735, "grad_norm": 123.59476470947266, "learning_rate": 7.277138157894737e-06, "loss": 0.3089, "step": 6630 }, { "epoch": 2.7302631578947367, "grad_norm": 0.18893316388130188, "learning_rate": 7.273026315789474e-06, "loss": 0.1738, "step": 6640 }, { "epoch": 2.734375, "grad_norm": 0.03182161599397659, "learning_rate": 7.268914473684211e-06, "loss": 0.1138, "step": 6650 }, { "epoch": 2.7384868421052633, "grad_norm": 14.257698059082031, "learning_rate": 7.264802631578948e-06, "loss": 0.0437, "step": 6660 }, { "epoch": 2.7425986842105265, "grad_norm": 0.24907447397708893, "learning_rate": 7.260690789473685e-06, "loss": 0.1522, "step": 6670 }, { "epoch": 2.7467105263157894, "grad_norm": 0.19256246089935303, "learning_rate": 7.256578947368421e-06, "loss": 0.0405, "step": 6680 }, { "epoch": 2.7508223684210527, "grad_norm": 13.120503425598145, "learning_rate": 7.252467105263159e-06, "loss": 0.3291, "step": 6690 }, { "epoch": 2.754934210526316, "grad_norm": 92.6781234741211, "learning_rate": 7.248355263157895e-06, "loss": 0.3699, "step": 6700 }, { "epoch": 2.7590460526315788, "grad_norm": 1.2149583101272583, "learning_rate": 7.244243421052633e-06, "loss": 0.1675, "step": 6710 }, { "epoch": 2.763157894736842, "grad_norm": 45.1708869934082, "learning_rate": 7.2401315789473694e-06, "loss": 0.7265, "step": 6720 }, { "epoch": 2.7672697368421053, "grad_norm": 0.01486230455338955, "learning_rate": 7.236019736842106e-06, "loss": 0.3647, "step": 6730 }, { "epoch": 2.7713815789473686, "grad_norm": 4.854966640472412, "learning_rate": 7.231907894736843e-06, "loss": 0.0995, "step": 6740 }, { "epoch": 2.7754934210526314, "grad_norm": 4.642969608306885, "learning_rate": 7.227796052631579e-06, "loss": 0.285, "step": 6750 }, { "epoch": 2.7796052631578947, "grad_norm": 45.799072265625, "learning_rate": 7.223684210526316e-06, "loss": 0.4681, "step": 6760 }, { "epoch": 2.783717105263158, "grad_norm": 62.760704040527344, "learning_rate": 7.2195723684210525e-06, "loss": 0.0934, "step": 6770 }, { "epoch": 2.7878289473684212, "grad_norm": 1.0289639234542847, "learning_rate": 7.21546052631579e-06, "loss": 0.1929, "step": 6780 }, { "epoch": 2.791940789473684, "grad_norm": 75.18840026855469, "learning_rate": 7.211348684210527e-06, "loss": 0.218, "step": 6790 }, { "epoch": 2.7960526315789473, "grad_norm": 0.4156988561153412, "learning_rate": 7.207236842105264e-06, "loss": 0.6963, "step": 6800 }, { "epoch": 2.8001644736842106, "grad_norm": 34.15106201171875, "learning_rate": 7.203125000000001e-06, "loss": 0.2267, "step": 6810 }, { "epoch": 2.8042763157894735, "grad_norm": 18.53900909423828, "learning_rate": 7.199013157894737e-06, "loss": 0.1175, "step": 6820 }, { "epoch": 2.8083881578947367, "grad_norm": 73.15071105957031, "learning_rate": 7.194901315789475e-06, "loss": 0.2903, "step": 6830 }, { "epoch": 2.8125, "grad_norm": 93.10474395751953, "learning_rate": 7.190789473684211e-06, "loss": 0.4457, "step": 6840 }, { "epoch": 2.8166118421052633, "grad_norm": 0.07544347643852234, "learning_rate": 7.186677631578948e-06, "loss": 0.5648, "step": 6850 }, { "epoch": 2.8207236842105265, "grad_norm": 0.11148311197757721, "learning_rate": 7.1825657894736845e-06, "loss": 0.31, "step": 6860 }, { "epoch": 2.8248355263157894, "grad_norm": 1.118446707725525, "learning_rate": 7.178453947368421e-06, "loss": 0.7102, "step": 6870 }, { "epoch": 2.8289473684210527, "grad_norm": 72.61494445800781, "learning_rate": 7.174342105263159e-06, "loss": 0.1706, "step": 6880 }, { "epoch": 2.833059210526316, "grad_norm": 0.8325988054275513, "learning_rate": 7.170230263157896e-06, "loss": 0.2599, "step": 6890 }, { "epoch": 2.8371710526315788, "grad_norm": 13.365738868713379, "learning_rate": 7.166118421052633e-06, "loss": 0.1346, "step": 6900 }, { "epoch": 2.841282894736842, "grad_norm": 2.8599555492401123, "learning_rate": 7.162006578947369e-06, "loss": 0.0275, "step": 6910 }, { "epoch": 2.8453947368421053, "grad_norm": 0.002780698239803314, "learning_rate": 7.157894736842106e-06, "loss": 0.1408, "step": 6920 }, { "epoch": 2.8495065789473686, "grad_norm": 1.336742877960205, "learning_rate": 7.1537828947368424e-06, "loss": 0.2551, "step": 6930 }, { "epoch": 2.8536184210526314, "grad_norm": 31.96080780029297, "learning_rate": 7.149671052631579e-06, "loss": 0.3136, "step": 6940 }, { "epoch": 2.8577302631578947, "grad_norm": 0.01327523123472929, "learning_rate": 7.145559210526316e-06, "loss": 0.2752, "step": 6950 }, { "epoch": 2.861842105263158, "grad_norm": 3.886087656021118, "learning_rate": 7.141447368421053e-06, "loss": 0.1399, "step": 6960 }, { "epoch": 2.8659539473684212, "grad_norm": 0.03213124722242355, "learning_rate": 7.13733552631579e-06, "loss": 0.002, "step": 6970 }, { "epoch": 2.870065789473684, "grad_norm": 0.41033652424812317, "learning_rate": 7.133223684210527e-06, "loss": 0.2465, "step": 6980 }, { "epoch": 2.8741776315789473, "grad_norm": 0.006523555144667625, "learning_rate": 7.129111842105264e-06, "loss": 0.2667, "step": 6990 }, { "epoch": 2.8782894736842106, "grad_norm": 0.21593809127807617, "learning_rate": 7.125e-06, "loss": 0.1531, "step": 7000 }, { "epoch": 2.8824013157894735, "grad_norm": 0.4313972294330597, "learning_rate": 7.120888157894738e-06, "loss": 0.2449, "step": 7010 }, { "epoch": 2.8865131578947367, "grad_norm": 0.06804917752742767, "learning_rate": 7.1167763157894745e-06, "loss": 0.4352, "step": 7020 }, { "epoch": 2.890625, "grad_norm": 47.29886245727539, "learning_rate": 7.112664473684211e-06, "loss": 0.1663, "step": 7030 }, { "epoch": 2.8947368421052633, "grad_norm": 0.2415972501039505, "learning_rate": 7.108552631578948e-06, "loss": 0.2693, "step": 7040 }, { "epoch": 2.8988486842105265, "grad_norm": 51.08794403076172, "learning_rate": 7.104440789473684e-06, "loss": 0.0981, "step": 7050 }, { "epoch": 2.9029605263157894, "grad_norm": 2.805889129638672, "learning_rate": 7.100328947368421e-06, "loss": 0.069, "step": 7060 }, { "epoch": 2.9070723684210527, "grad_norm": 0.1910848468542099, "learning_rate": 7.096217105263159e-06, "loss": 0.4011, "step": 7070 }, { "epoch": 2.911184210526316, "grad_norm": 18.64550018310547, "learning_rate": 7.092105263157896e-06, "loss": 0.0614, "step": 7080 }, { "epoch": 2.9152960526315788, "grad_norm": 0.04789988696575165, "learning_rate": 7.087993421052632e-06, "loss": 0.0657, "step": 7090 }, { "epoch": 2.919407894736842, "grad_norm": 6.012223243713379, "learning_rate": 7.083881578947369e-06, "loss": 0.1898, "step": 7100 }, { "epoch": 2.9235197368421053, "grad_norm": 14.203520774841309, "learning_rate": 7.079769736842106e-06, "loss": 0.1308, "step": 7110 }, { "epoch": 2.9276315789473686, "grad_norm": 88.35704803466797, "learning_rate": 7.075657894736842e-06, "loss": 0.4129, "step": 7120 }, { "epoch": 2.9317434210526314, "grad_norm": 0.012460517697036266, "learning_rate": 7.07154605263158e-06, "loss": 0.2988, "step": 7130 }, { "epoch": 2.9358552631578947, "grad_norm": 9.11884593963623, "learning_rate": 7.067434210526316e-06, "loss": 0.1225, "step": 7140 }, { "epoch": 2.939967105263158, "grad_norm": 63.315948486328125, "learning_rate": 7.063322368421053e-06, "loss": 0.6643, "step": 7150 }, { "epoch": 2.9440789473684212, "grad_norm": 11.84128475189209, "learning_rate": 7.0592105263157895e-06, "loss": 0.221, "step": 7160 }, { "epoch": 2.948190789473684, "grad_norm": 0.05120432749390602, "learning_rate": 7.055098684210527e-06, "loss": 0.5354, "step": 7170 }, { "epoch": 2.9523026315789473, "grad_norm": 0.002431727247312665, "learning_rate": 7.0509868421052644e-06, "loss": 0.2658, "step": 7180 }, { "epoch": 2.9564144736842106, "grad_norm": 0.0686754584312439, "learning_rate": 7.046875000000001e-06, "loss": 0.1946, "step": 7190 }, { "epoch": 2.9605263157894735, "grad_norm": 0.09617192298173904, "learning_rate": 7.042763157894738e-06, "loss": 0.1712, "step": 7200 }, { "epoch": 2.9646381578947367, "grad_norm": 45.63372039794922, "learning_rate": 7.038651315789474e-06, "loss": 0.1632, "step": 7210 }, { "epoch": 2.96875, "grad_norm": 0.09691905975341797, "learning_rate": 7.034539473684211e-06, "loss": 0.1844, "step": 7220 }, { "epoch": 2.9728618421052633, "grad_norm": 0.13701745867729187, "learning_rate": 7.0304276315789475e-06, "loss": 0.3205, "step": 7230 }, { "epoch": 2.9769736842105265, "grad_norm": 55.56039047241211, "learning_rate": 7.026315789473684e-06, "loss": 0.0395, "step": 7240 }, { "epoch": 2.9810855263157894, "grad_norm": 77.04964447021484, "learning_rate": 7.0222039473684215e-06, "loss": 0.2569, "step": 7250 }, { "epoch": 2.9851973684210527, "grad_norm": 0.3657775819301605, "learning_rate": 7.018092105263159e-06, "loss": 0.2819, "step": 7260 }, { "epoch": 2.989309210526316, "grad_norm": 15.447266578674316, "learning_rate": 7.013980263157896e-06, "loss": 0.3315, "step": 7270 }, { "epoch": 2.9934210526315788, "grad_norm": 3.8167836666107178, "learning_rate": 7.009868421052632e-06, "loss": 0.3787, "step": 7280 }, { "epoch": 2.997532894736842, "grad_norm": 53.00830841064453, "learning_rate": 7.005756578947369e-06, "loss": 0.1485, "step": 7290 }, { "epoch": 3.0016447368421053, "grad_norm": 0.0033909266348928213, "learning_rate": 7.001644736842106e-06, "loss": 0.4747, "step": 7300 }, { "epoch": 3.0057565789473686, "grad_norm": 5.891729831695557, "learning_rate": 6.997532894736843e-06, "loss": 0.1194, "step": 7310 }, { "epoch": 3.0098684210526314, "grad_norm": 16.72593879699707, "learning_rate": 6.9934210526315795e-06, "loss": 0.3064, "step": 7320 }, { "epoch": 3.0139802631578947, "grad_norm": 0.12237309664487839, "learning_rate": 6.989309210526316e-06, "loss": 0.0441, "step": 7330 }, { "epoch": 3.018092105263158, "grad_norm": 0.3185586631298065, "learning_rate": 6.985197368421053e-06, "loss": 0.01, "step": 7340 }, { "epoch": 3.0222039473684212, "grad_norm": 0.26841112971305847, "learning_rate": 6.981085526315789e-06, "loss": 0.1187, "step": 7350 }, { "epoch": 3.026315789473684, "grad_norm": 0.023548761382699013, "learning_rate": 6.976973684210528e-06, "loss": 0.1564, "step": 7360 }, { "epoch": 3.0304276315789473, "grad_norm": 4.014942646026611, "learning_rate": 6.972861842105264e-06, "loss": 0.0751, "step": 7370 }, { "epoch": 3.0345394736842106, "grad_norm": 0.199817955493927, "learning_rate": 6.968750000000001e-06, "loss": 0.2649, "step": 7380 }, { "epoch": 3.038651315789474, "grad_norm": 1.6823475360870361, "learning_rate": 6.9646381578947374e-06, "loss": 0.1527, "step": 7390 }, { "epoch": 3.0427631578947367, "grad_norm": 4.5417375564575195, "learning_rate": 6.960526315789474e-06, "loss": 0.1669, "step": 7400 }, { "epoch": 3.046875, "grad_norm": 0.4437451958656311, "learning_rate": 6.956414473684211e-06, "loss": 0.0024, "step": 7410 }, { "epoch": 3.0509868421052633, "grad_norm": 1.8537052869796753, "learning_rate": 6.952302631578948e-06, "loss": 0.1486, "step": 7420 }, { "epoch": 3.055098684210526, "grad_norm": 0.10921826213598251, "learning_rate": 6.948190789473685e-06, "loss": 0.1157, "step": 7430 }, { "epoch": 3.0592105263157894, "grad_norm": 0.14697521924972534, "learning_rate": 6.944078947368421e-06, "loss": 0.1615, "step": 7440 }, { "epoch": 3.0633223684210527, "grad_norm": 0.3917556703090668, "learning_rate": 6.939967105263159e-06, "loss": 0.1616, "step": 7450 }, { "epoch": 3.067434210526316, "grad_norm": 0.17994216084480286, "learning_rate": 6.935855263157895e-06, "loss": 0.322, "step": 7460 }, { "epoch": 3.0715460526315788, "grad_norm": 0.741560161113739, "learning_rate": 6.931743421052633e-06, "loss": 0.1822, "step": 7470 }, { "epoch": 3.075657894736842, "grad_norm": 2.4355533123016357, "learning_rate": 6.9276315789473695e-06, "loss": 0.5123, "step": 7480 }, { "epoch": 3.0797697368421053, "grad_norm": 5.086973667144775, "learning_rate": 6.923519736842106e-06, "loss": 0.3002, "step": 7490 }, { "epoch": 3.0838815789473686, "grad_norm": 0.2662551999092102, "learning_rate": 6.919407894736843e-06, "loss": 0.0768, "step": 7500 }, { "epoch": 3.0879934210526314, "grad_norm": 0.12516538798809052, "learning_rate": 6.915296052631579e-06, "loss": 0.2165, "step": 7510 }, { "epoch": 3.0921052631578947, "grad_norm": 0.0330631323158741, "learning_rate": 6.911184210526316e-06, "loss": 0.0027, "step": 7520 }, { "epoch": 3.096217105263158, "grad_norm": 91.03437805175781, "learning_rate": 6.9070723684210525e-06, "loss": 0.1534, "step": 7530 }, { "epoch": 3.1003289473684212, "grad_norm": 0.015870722010731697, "learning_rate": 6.902960526315789e-06, "loss": 0.0099, "step": 7540 }, { "epoch": 3.104440789473684, "grad_norm": 1.7429580688476562, "learning_rate": 6.898848684210527e-06, "loss": 0.1773, "step": 7550 }, { "epoch": 3.1085526315789473, "grad_norm": 16.66814422607422, "learning_rate": 6.894736842105264e-06, "loss": 0.0331, "step": 7560 }, { "epoch": 3.1126644736842106, "grad_norm": 0.08903591334819794, "learning_rate": 6.890625000000001e-06, "loss": 0.2967, "step": 7570 }, { "epoch": 3.116776315789474, "grad_norm": 24.660062789916992, "learning_rate": 6.886513157894737e-06, "loss": 0.63, "step": 7580 }, { "epoch": 3.1208881578947367, "grad_norm": 40.910377502441406, "learning_rate": 6.882401315789474e-06, "loss": 0.0253, "step": 7590 }, { "epoch": 3.125, "grad_norm": 0.23835232853889465, "learning_rate": 6.878289473684211e-06, "loss": 0.1164, "step": 7600 }, { "epoch": 3.1291118421052633, "grad_norm": 8.85913372039795, "learning_rate": 6.874177631578948e-06, "loss": 0.0507, "step": 7610 }, { "epoch": 3.1332236842105265, "grad_norm": 0.0227875467389822, "learning_rate": 6.8700657894736845e-06, "loss": 0.0667, "step": 7620 }, { "epoch": 3.1373355263157894, "grad_norm": 0.35952290892601013, "learning_rate": 6.865953947368421e-06, "loss": 0.035, "step": 7630 }, { "epoch": 3.1414473684210527, "grad_norm": 3.871885061264038, "learning_rate": 6.861842105263159e-06, "loss": 0.4717, "step": 7640 }, { "epoch": 3.145559210526316, "grad_norm": 0.0658380314707756, "learning_rate": 6.857730263157896e-06, "loss": 0.3083, "step": 7650 }, { "epoch": 3.1496710526315788, "grad_norm": 0.10795586556196213, "learning_rate": 6.853618421052633e-06, "loss": 0.0258, "step": 7660 }, { "epoch": 3.153782894736842, "grad_norm": 56.6161003112793, "learning_rate": 6.849506578947369e-06, "loss": 0.223, "step": 7670 }, { "epoch": 3.1578947368421053, "grad_norm": 72.3301010131836, "learning_rate": 6.845394736842106e-06, "loss": 0.1169, "step": 7680 }, { "epoch": 3.1620065789473686, "grad_norm": 0.7228354811668396, "learning_rate": 6.8412828947368425e-06, "loss": 0.0275, "step": 7690 }, { "epoch": 3.1661184210526314, "grad_norm": 19.717239379882812, "learning_rate": 6.837171052631579e-06, "loss": 0.0342, "step": 7700 }, { "epoch": 3.1702302631578947, "grad_norm": 2.3942067623138428, "learning_rate": 6.833059210526316e-06, "loss": 0.3279, "step": 7710 }, { "epoch": 3.174342105263158, "grad_norm": 68.46968078613281, "learning_rate": 6.828947368421053e-06, "loss": 0.0909, "step": 7720 }, { "epoch": 3.1784539473684212, "grad_norm": 6.748744010925293, "learning_rate": 6.82483552631579e-06, "loss": 0.3401, "step": 7730 }, { "epoch": 3.182565789473684, "grad_norm": 0.03430832922458649, "learning_rate": 6.820723684210527e-06, "loss": 0.2739, "step": 7740 }, { "epoch": 3.1866776315789473, "grad_norm": 80.0864486694336, "learning_rate": 6.816611842105264e-06, "loss": 0.2544, "step": 7750 }, { "epoch": 3.1907894736842106, "grad_norm": 67.80303192138672, "learning_rate": 6.8125e-06, "loss": 0.5098, "step": 7760 }, { "epoch": 3.1949013157894735, "grad_norm": 0.011421781964600086, "learning_rate": 6.808388157894738e-06, "loss": 0.0427, "step": 7770 }, { "epoch": 3.1990131578947367, "grad_norm": 0.10575248301029205, "learning_rate": 6.8042763157894745e-06, "loss": 0.1562, "step": 7780 }, { "epoch": 3.203125, "grad_norm": 21.22425651550293, "learning_rate": 6.800164473684211e-06, "loss": 0.13, "step": 7790 }, { "epoch": 3.2072368421052633, "grad_norm": 62.44038772583008, "learning_rate": 6.796052631578948e-06, "loss": 0.1369, "step": 7800 }, { "epoch": 3.2113486842105265, "grad_norm": 0.056282613426446915, "learning_rate": 6.791940789473684e-06, "loss": 0.0594, "step": 7810 }, { "epoch": 3.2154605263157894, "grad_norm": 0.05405473709106445, "learning_rate": 6.787828947368421e-06, "loss": 0.0014, "step": 7820 }, { "epoch": 3.2195723684210527, "grad_norm": 0.7672938704490662, "learning_rate": 6.783717105263159e-06, "loss": 0.2649, "step": 7830 }, { "epoch": 3.223684210526316, "grad_norm": 0.012963368557393551, "learning_rate": 6.779605263157896e-06, "loss": 0.003, "step": 7840 }, { "epoch": 3.2277960526315788, "grad_norm": 0.7032648324966431, "learning_rate": 6.7754934210526324e-06, "loss": 0.0383, "step": 7850 }, { "epoch": 3.231907894736842, "grad_norm": 0.00014193853712640703, "learning_rate": 6.771381578947369e-06, "loss": 0.2203, "step": 7860 }, { "epoch": 3.2360197368421053, "grad_norm": 0.16566841304302216, "learning_rate": 6.767269736842106e-06, "loss": 0.082, "step": 7870 }, { "epoch": 3.2401315789473686, "grad_norm": 0.011220881715416908, "learning_rate": 6.763157894736842e-06, "loss": 0.2433, "step": 7880 }, { "epoch": 3.2442434210526314, "grad_norm": 0.031912826001644135, "learning_rate": 6.75904605263158e-06, "loss": 0.1053, "step": 7890 }, { "epoch": 3.2483552631578947, "grad_norm": 0.00015638569311704487, "learning_rate": 6.754934210526316e-06, "loss": 0.0751, "step": 7900 }, { "epoch": 3.252467105263158, "grad_norm": 19.786230087280273, "learning_rate": 6.750822368421053e-06, "loss": 0.1573, "step": 7910 }, { "epoch": 3.2565789473684212, "grad_norm": 25.62039566040039, "learning_rate": 6.7467105263157895e-06, "loss": 0.0847, "step": 7920 }, { "epoch": 3.260690789473684, "grad_norm": 43.73009490966797, "learning_rate": 6.742598684210527e-06, "loss": 0.2365, "step": 7930 }, { "epoch": 3.2648026315789473, "grad_norm": 0.29684680700302124, "learning_rate": 6.7384868421052645e-06, "loss": 0.5742, "step": 7940 }, { "epoch": 3.2689144736842106, "grad_norm": 35.690513610839844, "learning_rate": 6.734375000000001e-06, "loss": 0.0705, "step": 7950 }, { "epoch": 3.2730263157894735, "grad_norm": 0.003651993814855814, "learning_rate": 6.730263157894738e-06, "loss": 0.0705, "step": 7960 }, { "epoch": 3.2771381578947367, "grad_norm": 0.0002773971064016223, "learning_rate": 6.726151315789474e-06, "loss": 0.2075, "step": 7970 }, { "epoch": 3.28125, "grad_norm": 0.35052403807640076, "learning_rate": 6.722039473684211e-06, "loss": 0.2401, "step": 7980 }, { "epoch": 3.2853618421052633, "grad_norm": 0.004375469870865345, "learning_rate": 6.7179276315789475e-06, "loss": 0.2805, "step": 7990 }, { "epoch": 3.2894736842105265, "grad_norm": 51.69245147705078, "learning_rate": 6.713815789473684e-06, "loss": 0.5704, "step": 8000 }, { "epoch": 3.2935855263157894, "grad_norm": 0.09066779911518097, "learning_rate": 6.709703947368421e-06, "loss": 0.1949, "step": 8010 }, { "epoch": 3.2976973684210527, "grad_norm": 20.805805206298828, "learning_rate": 6.705592105263159e-06, "loss": 0.5101, "step": 8020 }, { "epoch": 3.301809210526316, "grad_norm": 3.514925241470337, "learning_rate": 6.701480263157896e-06, "loss": 0.19, "step": 8030 }, { "epoch": 3.3059210526315788, "grad_norm": 1.247069239616394, "learning_rate": 6.697368421052632e-06, "loss": 0.2137, "step": 8040 }, { "epoch": 3.310032894736842, "grad_norm": 88.480712890625, "learning_rate": 6.693256578947369e-06, "loss": 0.4556, "step": 8050 }, { "epoch": 3.3141447368421053, "grad_norm": 0.18627463281154633, "learning_rate": 6.6891447368421054e-06, "loss": 0.1171, "step": 8060 }, { "epoch": 3.3182565789473686, "grad_norm": 0.06986451894044876, "learning_rate": 6.685032894736843e-06, "loss": 0.4195, "step": 8070 }, { "epoch": 3.3223684210526314, "grad_norm": 47.814727783203125, "learning_rate": 6.6809210526315795e-06, "loss": 0.1274, "step": 8080 }, { "epoch": 3.3264802631578947, "grad_norm": 63.75092315673828, "learning_rate": 6.676809210526316e-06, "loss": 0.4185, "step": 8090 }, { "epoch": 3.330592105263158, "grad_norm": 3.7314867973327637, "learning_rate": 6.672697368421053e-06, "loss": 0.1311, "step": 8100 }, { "epoch": 3.3347039473684212, "grad_norm": 16.43247413635254, "learning_rate": 6.668585526315789e-06, "loss": 0.2694, "step": 8110 }, { "epoch": 3.338815789473684, "grad_norm": 0.008259894326329231, "learning_rate": 6.664473684210528e-06, "loss": 0.2619, "step": 8120 }, { "epoch": 3.3429276315789473, "grad_norm": 0.09545180946588516, "learning_rate": 6.660361842105264e-06, "loss": 0.2593, "step": 8130 }, { "epoch": 3.3470394736842106, "grad_norm": 2.6562247276306152, "learning_rate": 6.656250000000001e-06, "loss": 0.1714, "step": 8140 }, { "epoch": 3.3511513157894735, "grad_norm": 1.3511881828308105, "learning_rate": 6.6521381578947375e-06, "loss": 0.1609, "step": 8150 }, { "epoch": 3.3552631578947367, "grad_norm": 0.45312735438346863, "learning_rate": 6.648026315789474e-06, "loss": 0.402, "step": 8160 }, { "epoch": 3.359375, "grad_norm": 0.008526019752025604, "learning_rate": 6.643914473684211e-06, "loss": 0.1503, "step": 8170 }, { "epoch": 3.3634868421052633, "grad_norm": 4.806290149688721, "learning_rate": 6.639802631578947e-06, "loss": 0.3165, "step": 8180 }, { "epoch": 3.3675986842105265, "grad_norm": 0.008733537979424, "learning_rate": 6.635690789473685e-06, "loss": 0.017, "step": 8190 }, { "epoch": 3.3717105263157894, "grad_norm": 11.375794410705566, "learning_rate": 6.631578947368421e-06, "loss": 0.0327, "step": 8200 }, { "epoch": 3.3758223684210527, "grad_norm": 1.7857224941253662, "learning_rate": 6.627467105263159e-06, "loss": 0.0189, "step": 8210 }, { "epoch": 3.379934210526316, "grad_norm": 7.3000102043151855, "learning_rate": 6.623355263157895e-06, "loss": 0.0515, "step": 8220 }, { "epoch": 3.3840460526315788, "grad_norm": 0.0007516624755226076, "learning_rate": 6.619243421052632e-06, "loss": 0.013, "step": 8230 }, { "epoch": 3.388157894736842, "grad_norm": 74.04230499267578, "learning_rate": 6.6151315789473695e-06, "loss": 0.2848, "step": 8240 }, { "epoch": 3.3922697368421053, "grad_norm": 0.06303539872169495, "learning_rate": 6.611019736842106e-06, "loss": 0.0307, "step": 8250 }, { "epoch": 3.3963815789473686, "grad_norm": 24.18715476989746, "learning_rate": 6.606907894736843e-06, "loss": 0.2593, "step": 8260 }, { "epoch": 3.4004934210526314, "grad_norm": 0.29844167828559875, "learning_rate": 6.602796052631579e-06, "loss": 0.1038, "step": 8270 }, { "epoch": 3.4046052631578947, "grad_norm": 0.1648898422718048, "learning_rate": 6.598684210526316e-06, "loss": 0.1039, "step": 8280 }, { "epoch": 3.408717105263158, "grad_norm": 1.139145016670227, "learning_rate": 6.5945723684210525e-06, "loss": 0.2466, "step": 8290 }, { "epoch": 3.4128289473684212, "grad_norm": 7.223357200622559, "learning_rate": 6.590460526315789e-06, "loss": 0.3805, "step": 8300 }, { "epoch": 3.416940789473684, "grad_norm": 0.002951091853901744, "learning_rate": 6.5863486842105274e-06, "loss": 0.2528, "step": 8310 }, { "epoch": 3.4210526315789473, "grad_norm": 0.04810519888997078, "learning_rate": 6.582236842105264e-06, "loss": 0.0248, "step": 8320 }, { "epoch": 3.4251644736842106, "grad_norm": 44.80435562133789, "learning_rate": 6.578125000000001e-06, "loss": 0.148, "step": 8330 }, { "epoch": 3.4292763157894735, "grad_norm": 0.6150760054588318, "learning_rate": 6.574013157894737e-06, "loss": 0.1437, "step": 8340 }, { "epoch": 3.4333881578947367, "grad_norm": 0.06286383420228958, "learning_rate": 6.569901315789474e-06, "loss": 0.2608, "step": 8350 }, { "epoch": 3.4375, "grad_norm": 0.003701314562931657, "learning_rate": 6.565789473684211e-06, "loss": 0.3989, "step": 8360 }, { "epoch": 3.4416118421052633, "grad_norm": 0.033227551728487015, "learning_rate": 6.561677631578948e-06, "loss": 0.2687, "step": 8370 }, { "epoch": 3.4457236842105265, "grad_norm": 74.94076538085938, "learning_rate": 6.5575657894736845e-06, "loss": 0.2218, "step": 8380 }, { "epoch": 3.4498355263157894, "grad_norm": 0.15582340955734253, "learning_rate": 6.553453947368421e-06, "loss": 0.0771, "step": 8390 }, { "epoch": 3.4539473684210527, "grad_norm": 0.005174733232706785, "learning_rate": 6.549342105263159e-06, "loss": 0.0989, "step": 8400 }, { "epoch": 3.458059210526316, "grad_norm": 7.792595863342285, "learning_rate": 6.545230263157896e-06, "loss": 0.0484, "step": 8410 }, { "epoch": 3.4621710526315788, "grad_norm": 0.11872321367263794, "learning_rate": 6.541118421052633e-06, "loss": 0.0035, "step": 8420 }, { "epoch": 3.466282894736842, "grad_norm": 5.740940832765773e-05, "learning_rate": 6.537006578947369e-06, "loss": 0.2619, "step": 8430 }, { "epoch": 3.4703947368421053, "grad_norm": 0.37503787875175476, "learning_rate": 6.532894736842106e-06, "loss": 0.2062, "step": 8440 }, { "epoch": 3.4745065789473686, "grad_norm": 12.798948287963867, "learning_rate": 6.5287828947368425e-06, "loss": 0.3276, "step": 8450 }, { "epoch": 3.4786184210526314, "grad_norm": 4.535250663757324, "learning_rate": 6.524671052631579e-06, "loss": 0.1852, "step": 8460 }, { "epoch": 3.4827302631578947, "grad_norm": 0.47860297560691833, "learning_rate": 6.520559210526316e-06, "loss": 0.4313, "step": 8470 }, { "epoch": 3.486842105263158, "grad_norm": 61.79635238647461, "learning_rate": 6.516447368421053e-06, "loss": 0.318, "step": 8480 }, { "epoch": 3.4909539473684212, "grad_norm": 9.027344703674316, "learning_rate": 6.51233552631579e-06, "loss": 0.1673, "step": 8490 }, { "epoch": 3.495065789473684, "grad_norm": 0.1292915791273117, "learning_rate": 6.508223684210527e-06, "loss": 0.3923, "step": 8500 }, { "epoch": 3.4991776315789473, "grad_norm": 0.0020961903501302004, "learning_rate": 6.504111842105264e-06, "loss": 0.2699, "step": 8510 }, { "epoch": 3.5032894736842106, "grad_norm": 0.14688652753829956, "learning_rate": 6.5000000000000004e-06, "loss": 0.2736, "step": 8520 }, { "epoch": 3.5074013157894735, "grad_norm": 61.5792121887207, "learning_rate": 6.495888157894737e-06, "loss": 0.1765, "step": 8530 }, { "epoch": 3.5115131578947367, "grad_norm": 0.07907792925834656, "learning_rate": 6.4917763157894745e-06, "loss": 0.158, "step": 8540 }, { "epoch": 3.515625, "grad_norm": 0.29822736978530884, "learning_rate": 6.487664473684211e-06, "loss": 0.0808, "step": 8550 }, { "epoch": 3.5197368421052633, "grad_norm": 5.326796054840088, "learning_rate": 6.483552631578948e-06, "loss": 0.0521, "step": 8560 }, { "epoch": 3.5238486842105265, "grad_norm": 46.79924011230469, "learning_rate": 6.479440789473684e-06, "loss": 0.5624, "step": 8570 }, { "epoch": 3.5279605263157894, "grad_norm": 0.1057400181889534, "learning_rate": 6.475328947368421e-06, "loss": 0.3374, "step": 8580 }, { "epoch": 3.5320723684210527, "grad_norm": 2.302854061126709, "learning_rate": 6.471217105263159e-06, "loss": 0.1588, "step": 8590 }, { "epoch": 3.536184210526316, "grad_norm": 0.31161850690841675, "learning_rate": 6.467105263157896e-06, "loss": 0.4272, "step": 8600 }, { "epoch": 3.5402960526315788, "grad_norm": 1.1857142448425293, "learning_rate": 6.4629934210526325e-06, "loss": 0.3116, "step": 8610 }, { "epoch": 3.544407894736842, "grad_norm": 74.4317398071289, "learning_rate": 6.458881578947369e-06, "loss": 0.0994, "step": 8620 }, { "epoch": 3.5485197368421053, "grad_norm": 1.3740957975387573, "learning_rate": 6.454769736842106e-06, "loss": 0.5356, "step": 8630 }, { "epoch": 3.5526315789473686, "grad_norm": 66.15660858154297, "learning_rate": 6.450657894736842e-06, "loss": 0.096, "step": 8640 }, { "epoch": 3.5567434210526314, "grad_norm": 130.39633178710938, "learning_rate": 6.446546052631579e-06, "loss": 0.2427, "step": 8650 }, { "epoch": 3.5608552631578947, "grad_norm": 0.7252002954483032, "learning_rate": 6.442434210526316e-06, "loss": 0.0773, "step": 8660 }, { "epoch": 3.564967105263158, "grad_norm": 0.32898321747779846, "learning_rate": 6.438322368421053e-06, "loss": 0.2922, "step": 8670 }, { "epoch": 3.5690789473684212, "grad_norm": 0.4259633719921112, "learning_rate": 6.4342105263157896e-06, "loss": 0.1903, "step": 8680 }, { "epoch": 3.573190789473684, "grad_norm": 0.02078865095973015, "learning_rate": 6.430098684210527e-06, "loss": 0.0631, "step": 8690 }, { "epoch": 3.5773026315789473, "grad_norm": 5.149608612060547, "learning_rate": 6.425986842105264e-06, "loss": 0.0189, "step": 8700 }, { "epoch": 3.5814144736842106, "grad_norm": 0.003035628003999591, "learning_rate": 6.421875000000001e-06, "loss": 0.339, "step": 8710 }, { "epoch": 3.5855263157894735, "grad_norm": 81.40657043457031, "learning_rate": 6.417763157894738e-06, "loss": 0.2999, "step": 8720 }, { "epoch": 3.5896381578947367, "grad_norm": 5.7936906814575195, "learning_rate": 6.413651315789474e-06, "loss": 0.1581, "step": 8730 }, { "epoch": 3.59375, "grad_norm": 58.010353088378906, "learning_rate": 6.409539473684211e-06, "loss": 0.2292, "step": 8740 }, { "epoch": 3.5978618421052633, "grad_norm": 10.935487747192383, "learning_rate": 6.4054276315789475e-06, "loss": 0.0143, "step": 8750 }, { "epoch": 3.6019736842105265, "grad_norm": 0.09966960549354553, "learning_rate": 6.401315789473684e-06, "loss": 0.105, "step": 8760 }, { "epoch": 3.6060855263157894, "grad_norm": 0.00877199787646532, "learning_rate": 6.397203947368421e-06, "loss": 0.2848, "step": 8770 }, { "epoch": 3.6101973684210527, "grad_norm": 81.7994613647461, "learning_rate": 6.393092105263159e-06, "loss": 0.1348, "step": 8780 }, { "epoch": 3.614309210526316, "grad_norm": 0.004074628930538893, "learning_rate": 6.388980263157896e-06, "loss": 0.1723, "step": 8790 }, { "epoch": 3.6184210526315788, "grad_norm": 1.1934515237808228, "learning_rate": 6.384868421052632e-06, "loss": 0.0017, "step": 8800 }, { "epoch": 3.622532894736842, "grad_norm": 0.01399979181587696, "learning_rate": 6.380756578947369e-06, "loss": 0.2224, "step": 8810 }, { "epoch": 3.6266447368421053, "grad_norm": 115.63423156738281, "learning_rate": 6.3766447368421055e-06, "loss": 0.3234, "step": 8820 }, { "epoch": 3.6307565789473686, "grad_norm": 0.49743548035621643, "learning_rate": 6.372532894736843e-06, "loss": 0.5291, "step": 8830 }, { "epoch": 3.6348684210526314, "grad_norm": 1.7224241495132446, "learning_rate": 6.3684210526315795e-06, "loss": 0.2441, "step": 8840 }, { "epoch": 3.6389802631578947, "grad_norm": 56.73593521118164, "learning_rate": 6.364309210526316e-06, "loss": 0.1411, "step": 8850 }, { "epoch": 3.643092105263158, "grad_norm": 0.0021920078434050083, "learning_rate": 6.360197368421053e-06, "loss": 0.2172, "step": 8860 }, { "epoch": 3.6472039473684212, "grad_norm": 0.005450670141726732, "learning_rate": 6.356085526315789e-06, "loss": 0.0721, "step": 8870 }, { "epoch": 3.651315789473684, "grad_norm": 0.0028689634054899216, "learning_rate": 6.351973684210528e-06, "loss": 0.033, "step": 8880 }, { "epoch": 3.6554276315789473, "grad_norm": 41.87671661376953, "learning_rate": 6.347861842105264e-06, "loss": 0.1019, "step": 8890 }, { "epoch": 3.6595394736842106, "grad_norm": 0.1014520600438118, "learning_rate": 6.343750000000001e-06, "loss": 0.0836, "step": 8900 }, { "epoch": 3.6636513157894735, "grad_norm": 0.8657021522521973, "learning_rate": 6.3396381578947375e-06, "loss": 0.0689, "step": 8910 }, { "epoch": 3.6677631578947367, "grad_norm": 6.391144275665283, "learning_rate": 6.335526315789474e-06, "loss": 0.2655, "step": 8920 }, { "epoch": 3.671875, "grad_norm": 25.194692611694336, "learning_rate": 6.331414473684211e-06, "loss": 0.2248, "step": 8930 }, { "epoch": 3.6759868421052633, "grad_norm": 87.24070739746094, "learning_rate": 6.327302631578947e-06, "loss": 0.1769, "step": 8940 }, { "epoch": 3.6800986842105265, "grad_norm": 84.9614486694336, "learning_rate": 6.323190789473685e-06, "loss": 0.1785, "step": 8950 }, { "epoch": 3.6842105263157894, "grad_norm": 3.7174837589263916, "learning_rate": 6.319078947368421e-06, "loss": 0.1043, "step": 8960 }, { "epoch": 3.6883223684210527, "grad_norm": 19.11334228515625, "learning_rate": 6.314967105263159e-06, "loss": 0.1056, "step": 8970 }, { "epoch": 3.692434210526316, "grad_norm": 2.9218459129333496, "learning_rate": 6.3108552631578954e-06, "loss": 0.0715, "step": 8980 }, { "epoch": 3.6965460526315788, "grad_norm": 0.7350709438323975, "learning_rate": 6.306743421052632e-06, "loss": 0.1103, "step": 8990 }, { "epoch": 3.700657894736842, "grad_norm": 25.493600845336914, "learning_rate": 6.3026315789473695e-06, "loss": 0.4917, "step": 9000 }, { "epoch": 3.7047697368421053, "grad_norm": 0.3064211905002594, "learning_rate": 6.298519736842106e-06, "loss": 0.0834, "step": 9010 }, { "epoch": 3.7088815789473686, "grad_norm": 0.8027281165122986, "learning_rate": 6.294407894736843e-06, "loss": 0.2116, "step": 9020 }, { "epoch": 3.7129934210526314, "grad_norm": 0.023541197180747986, "learning_rate": 6.290296052631579e-06, "loss": 0.2796, "step": 9030 }, { "epoch": 3.7171052631578947, "grad_norm": 0.007667736150324345, "learning_rate": 6.286184210526316e-06, "loss": 0.107, "step": 9040 }, { "epoch": 3.721217105263158, "grad_norm": 22.70380973815918, "learning_rate": 6.2820723684210526e-06, "loss": 0.1017, "step": 9050 }, { "epoch": 3.7253289473684212, "grad_norm": 0.20407456159591675, "learning_rate": 6.277960526315789e-06, "loss": 0.0103, "step": 9060 }, { "epoch": 3.729440789473684, "grad_norm": 75.71296691894531, "learning_rate": 6.2738486842105275e-06, "loss": 0.4619, "step": 9070 }, { "epoch": 3.7335526315789473, "grad_norm": 58.81220626831055, "learning_rate": 6.269736842105264e-06, "loss": 0.1691, "step": 9080 }, { "epoch": 3.7376644736842106, "grad_norm": 54.02885818481445, "learning_rate": 6.265625000000001e-06, "loss": 0.2037, "step": 9090 }, { "epoch": 3.7417763157894735, "grad_norm": 1.0572305917739868, "learning_rate": 6.261513157894737e-06, "loss": 0.0768, "step": 9100 }, { "epoch": 3.7458881578947367, "grad_norm": 14.10479736328125, "learning_rate": 6.257401315789474e-06, "loss": 0.3359, "step": 9110 }, { "epoch": 3.75, "grad_norm": 0.23570677638053894, "learning_rate": 6.2532894736842105e-06, "loss": 0.0442, "step": 9120 }, { "epoch": 3.7541118421052633, "grad_norm": 1.2006131410598755, "learning_rate": 6.249177631578948e-06, "loss": 0.1693, "step": 9130 }, { "epoch": 3.7582236842105265, "grad_norm": 65.66853332519531, "learning_rate": 6.2450657894736846e-06, "loss": 0.1355, "step": 9140 }, { "epoch": 3.7623355263157894, "grad_norm": 62.854225158691406, "learning_rate": 6.240953947368421e-06, "loss": 0.1069, "step": 9150 }, { "epoch": 3.7664473684210527, "grad_norm": 107.98965454101562, "learning_rate": 6.236842105263159e-06, "loss": 0.1923, "step": 9160 }, { "epoch": 3.770559210526316, "grad_norm": 0.35728099942207336, "learning_rate": 6.232730263157895e-06, "loss": 0.1733, "step": 9170 }, { "epoch": 3.7746710526315788, "grad_norm": 8.636373519897461, "learning_rate": 6.228618421052633e-06, "loss": 0.2717, "step": 9180 }, { "epoch": 3.778782894736842, "grad_norm": 0.19581067562103271, "learning_rate": 6.224506578947369e-06, "loss": 0.3486, "step": 9190 }, { "epoch": 3.7828947368421053, "grad_norm": 0.1582966297864914, "learning_rate": 6.220394736842106e-06, "loss": 0.0158, "step": 9200 }, { "epoch": 3.7870065789473686, "grad_norm": 13.618598937988281, "learning_rate": 6.2162828947368425e-06, "loss": 0.6965, "step": 9210 }, { "epoch": 3.7911184210526314, "grad_norm": 5.897958755493164, "learning_rate": 6.212171052631579e-06, "loss": 0.2573, "step": 9220 }, { "epoch": 3.7952302631578947, "grad_norm": 59.38594055175781, "learning_rate": 6.208059210526316e-06, "loss": 0.0541, "step": 9230 }, { "epoch": 3.799342105263158, "grad_norm": 23.412710189819336, "learning_rate": 6.203947368421052e-06, "loss": 0.1623, "step": 9240 }, { "epoch": 3.8034539473684212, "grad_norm": 0.25934499502182007, "learning_rate": 6.19983552631579e-06, "loss": 0.2866, "step": 9250 }, { "epoch": 3.807565789473684, "grad_norm": 0.5576783418655396, "learning_rate": 6.195723684210527e-06, "loss": 0.0381, "step": 9260 }, { "epoch": 3.8116776315789473, "grad_norm": 0.00027464405866339803, "learning_rate": 6.191611842105264e-06, "loss": 0.4702, "step": 9270 }, { "epoch": 3.8157894736842106, "grad_norm": 56.96987533569336, "learning_rate": 6.1875000000000005e-06, "loss": 0.3138, "step": 9280 }, { "epoch": 3.8199013157894735, "grad_norm": 8.449546813964844, "learning_rate": 6.183388157894737e-06, "loss": 0.1577, "step": 9290 }, { "epoch": 3.8240131578947367, "grad_norm": 0.0012974452693015337, "learning_rate": 6.1792763157894745e-06, "loss": 0.103, "step": 9300 }, { "epoch": 3.828125, "grad_norm": 0.14060595631599426, "learning_rate": 6.175164473684211e-06, "loss": 0.1491, "step": 9310 }, { "epoch": 3.8322368421052633, "grad_norm": 4.392583847045898, "learning_rate": 6.171052631578948e-06, "loss": 0.0416, "step": 9320 }, { "epoch": 3.8363486842105265, "grad_norm": 95.75727844238281, "learning_rate": 6.166940789473684e-06, "loss": 0.2727, "step": 9330 }, { "epoch": 3.8404605263157894, "grad_norm": 1.871319055557251, "learning_rate": 6.162828947368421e-06, "loss": 0.188, "step": 9340 }, { "epoch": 3.8445723684210527, "grad_norm": 14.953452110290527, "learning_rate": 6.158717105263159e-06, "loss": 0.4324, "step": 9350 }, { "epoch": 3.848684210526316, "grad_norm": 55.11267852783203, "learning_rate": 6.154605263157896e-06, "loss": 0.0427, "step": 9360 }, { "epoch": 3.8527960526315788, "grad_norm": 0.2453383207321167, "learning_rate": 6.1504934210526325e-06, "loss": 0.1925, "step": 9370 }, { "epoch": 3.856907894736842, "grad_norm": 0.05007212609052658, "learning_rate": 6.146381578947369e-06, "loss": 0.2459, "step": 9380 }, { "epoch": 3.8610197368421053, "grad_norm": 129.4027862548828, "learning_rate": 6.142269736842106e-06, "loss": 0.3909, "step": 9390 }, { "epoch": 3.8651315789473686, "grad_norm": 0.061377912759780884, "learning_rate": 6.138157894736842e-06, "loss": 0.1726, "step": 9400 }, { "epoch": 3.8692434210526314, "grad_norm": 0.07050663232803345, "learning_rate": 6.134046052631579e-06, "loss": 0.2633, "step": 9410 }, { "epoch": 3.8733552631578947, "grad_norm": 0.820439875125885, "learning_rate": 6.129934210526316e-06, "loss": 0.3758, "step": 9420 }, { "epoch": 3.877467105263158, "grad_norm": 64.52366638183594, "learning_rate": 6.125822368421053e-06, "loss": 0.0713, "step": 9430 }, { "epoch": 3.8815789473684212, "grad_norm": 53.69047927856445, "learning_rate": 6.12171052631579e-06, "loss": 0.2844, "step": 9440 }, { "epoch": 3.885690789473684, "grad_norm": 0.0030439761467278004, "learning_rate": 6.117598684210527e-06, "loss": 0.1889, "step": 9450 }, { "epoch": 3.8898026315789473, "grad_norm": 0.0717514380812645, "learning_rate": 6.113486842105264e-06, "loss": 0.4461, "step": 9460 }, { "epoch": 3.8939144736842106, "grad_norm": 0.015020138584077358, "learning_rate": 6.109375000000001e-06, "loss": 0.115, "step": 9470 }, { "epoch": 3.8980263157894735, "grad_norm": 2.4814159870147705, "learning_rate": 6.105263157894738e-06, "loss": 0.2168, "step": 9480 }, { "epoch": 3.9021381578947367, "grad_norm": 1.2755320072174072, "learning_rate": 6.101151315789474e-06, "loss": 0.0417, "step": 9490 }, { "epoch": 3.90625, "grad_norm": 0.14059193432331085, "learning_rate": 6.097039473684211e-06, "loss": 0.1493, "step": 9500 }, { "epoch": 3.9103618421052633, "grad_norm": 0.1622990518808365, "learning_rate": 6.0929276315789476e-06, "loss": 0.0046, "step": 9510 }, { "epoch": 3.9144736842105265, "grad_norm": 0.36548370122909546, "learning_rate": 6.088815789473684e-06, "loss": 0.0143, "step": 9520 }, { "epoch": 3.9185855263157894, "grad_norm": 6.167750835418701, "learning_rate": 6.084703947368421e-06, "loss": 0.2583, "step": 9530 }, { "epoch": 3.9226973684210527, "grad_norm": 0.017177041620016098, "learning_rate": 6.080592105263159e-06, "loss": 0.2583, "step": 9540 }, { "epoch": 3.926809210526316, "grad_norm": 7.4479875564575195, "learning_rate": 6.076480263157896e-06, "loss": 0.0245, "step": 9550 }, { "epoch": 3.9309210526315788, "grad_norm": 6.392214775085449, "learning_rate": 6.072368421052632e-06, "loss": 0.0585, "step": 9560 }, { "epoch": 3.935032894736842, "grad_norm": 45.556400299072266, "learning_rate": 6.068256578947369e-06, "loss": 0.1436, "step": 9570 }, { "epoch": 3.9391447368421053, "grad_norm": 0.0030166308861225843, "learning_rate": 6.0641447368421055e-06, "loss": 0.3354, "step": 9580 }, { "epoch": 3.9432565789473686, "grad_norm": 0.04616885259747505, "learning_rate": 6.060032894736842e-06, "loss": 0.3983, "step": 9590 }, { "epoch": 3.9473684210526314, "grad_norm": 50.227230072021484, "learning_rate": 6.0559210526315796e-06, "loss": 0.2884, "step": 9600 }, { "epoch": 3.9514802631578947, "grad_norm": 74.54116821289062, "learning_rate": 6.051809210526316e-06, "loss": 0.0673, "step": 9610 }, { "epoch": 3.955592105263158, "grad_norm": 2.7192249298095703, "learning_rate": 6.047697368421053e-06, "loss": 0.0884, "step": 9620 }, { "epoch": 3.9597039473684212, "grad_norm": 9.346713066101074, "learning_rate": 6.043585526315789e-06, "loss": 0.0085, "step": 9630 }, { "epoch": 3.963815789473684, "grad_norm": 67.16130065917969, "learning_rate": 6.039473684210527e-06, "loss": 0.6519, "step": 9640 }, { "epoch": 3.9679276315789473, "grad_norm": 0.004526804201304913, "learning_rate": 6.035361842105264e-06, "loss": 0.0041, "step": 9650 }, { "epoch": 3.9720394736842106, "grad_norm": 22.933185577392578, "learning_rate": 6.031250000000001e-06, "loss": 0.0321, "step": 9660 }, { "epoch": 3.9761513157894735, "grad_norm": 0.031579192727804184, "learning_rate": 6.0271381578947375e-06, "loss": 0.0978, "step": 9670 }, { "epoch": 3.9802631578947367, "grad_norm": 0.30152076482772827, "learning_rate": 6.023026315789474e-06, "loss": 0.2345, "step": 9680 }, { "epoch": 3.984375, "grad_norm": 0.02326991781592369, "learning_rate": 6.018914473684211e-06, "loss": 0.1673, "step": 9690 }, { "epoch": 3.9884868421052633, "grad_norm": 0.11922337114810944, "learning_rate": 6.014802631578947e-06, "loss": 0.5942, "step": 9700 }, { "epoch": 3.9925986842105265, "grad_norm": 0.20456120371818542, "learning_rate": 6.010690789473684e-06, "loss": 0.2563, "step": 9710 }, { "epoch": 3.9967105263157894, "grad_norm": 0.08687052130699158, "learning_rate": 6.006578947368421e-06, "loss": 0.1033, "step": 9720 }, { "epoch": 4.000822368421052, "grad_norm": 0.08968238532543182, "learning_rate": 6.002467105263159e-06, "loss": 0.1698, "step": 9730 }, { "epoch": 4.004934210526316, "grad_norm": 0.6397474408149719, "learning_rate": 5.9983552631578955e-06, "loss": 0.0484, "step": 9740 }, { "epoch": 4.009046052631579, "grad_norm": 4.620758056640625, "learning_rate": 5.994243421052632e-06, "loss": 0.1483, "step": 9750 }, { "epoch": 4.0131578947368425, "grad_norm": 10.348681449890137, "learning_rate": 5.990131578947369e-06, "loss": 0.0073, "step": 9760 }, { "epoch": 4.017269736842105, "grad_norm": 62.68147659301758, "learning_rate": 5.986019736842106e-06, "loss": 0.0964, "step": 9770 }, { "epoch": 4.021381578947368, "grad_norm": 5.771296501159668, "learning_rate": 5.981907894736843e-06, "loss": 0.0631, "step": 9780 }, { "epoch": 4.025493421052632, "grad_norm": 63.21457290649414, "learning_rate": 5.977796052631579e-06, "loss": 0.3454, "step": 9790 }, { "epoch": 4.029605263157895, "grad_norm": 0.0331178717315197, "learning_rate": 5.973684210526316e-06, "loss": 0.0043, "step": 9800 }, { "epoch": 4.0337171052631575, "grad_norm": 13.591999053955078, "learning_rate": 5.969572368421053e-06, "loss": 0.01, "step": 9810 }, { "epoch": 4.037828947368421, "grad_norm": 2.523550033569336, "learning_rate": 5.965460526315789e-06, "loss": 0.1164, "step": 9820 }, { "epoch": 4.041940789473684, "grad_norm": 0.0010173360351473093, "learning_rate": 5.9613486842105275e-06, "loss": 0.0702, "step": 9830 }, { "epoch": 4.046052631578948, "grad_norm": 0.008227844722568989, "learning_rate": 5.957236842105264e-06, "loss": 0.0444, "step": 9840 }, { "epoch": 4.050164473684211, "grad_norm": 51.15441131591797, "learning_rate": 5.953125000000001e-06, "loss": 0.0465, "step": 9850 }, { "epoch": 4.0542763157894735, "grad_norm": 0.08978760987520218, "learning_rate": 5.949013157894737e-06, "loss": 0.2733, "step": 9860 }, { "epoch": 4.058388157894737, "grad_norm": 0.0032470934092998505, "learning_rate": 5.944901315789474e-06, "loss": 0.0077, "step": 9870 }, { "epoch": 4.0625, "grad_norm": 2.342414617538452, "learning_rate": 5.9407894736842105e-06, "loss": 0.023, "step": 9880 }, { "epoch": 4.066611842105263, "grad_norm": 0.2732146978378296, "learning_rate": 5.936677631578948e-06, "loss": 0.1348, "step": 9890 }, { "epoch": 4.0707236842105265, "grad_norm": 0.014328996650874615, "learning_rate": 5.932565789473685e-06, "loss": 0.0011, "step": 9900 }, { "epoch": 4.074835526315789, "grad_norm": 56.14218521118164, "learning_rate": 5.928453947368421e-06, "loss": 0.1015, "step": 9910 }, { "epoch": 4.078947368421052, "grad_norm": 0.04061206430196762, "learning_rate": 5.924342105263159e-06, "loss": 0.3239, "step": 9920 }, { "epoch": 4.083059210526316, "grad_norm": 0.8352648019790649, "learning_rate": 5.920230263157895e-06, "loss": 0.0036, "step": 9930 }, { "epoch": 4.087171052631579, "grad_norm": 8.8486909866333, "learning_rate": 5.916118421052633e-06, "loss": 0.0076, "step": 9940 }, { "epoch": 4.0912828947368425, "grad_norm": 88.49449157714844, "learning_rate": 5.912006578947369e-06, "loss": 0.179, "step": 9950 }, { "epoch": 4.095394736842105, "grad_norm": 2.3278348445892334, "learning_rate": 5.907894736842106e-06, "loss": 0.1412, "step": 9960 }, { "epoch": 4.099506578947368, "grad_norm": 17.677732467651367, "learning_rate": 5.9037828947368425e-06, "loss": 0.0092, "step": 9970 }, { "epoch": 4.103618421052632, "grad_norm": 1.240334391593933, "learning_rate": 5.899671052631579e-06, "loss": 0.2093, "step": 9980 }, { "epoch": 4.107730263157895, "grad_norm": 0.012150059454143047, "learning_rate": 5.895559210526316e-06, "loss": 0.0079, "step": 9990 }, { "epoch": 4.1118421052631575, "grad_norm": 76.60591125488281, "learning_rate": 5.891447368421052e-06, "loss": 0.2322, "step": 10000 }, { "epoch": 4.115953947368421, "grad_norm": 0.41860559582710266, "learning_rate": 5.88733552631579e-06, "loss": 0.1039, "step": 10010 }, { "epoch": 4.120065789473684, "grad_norm": 0.010364804416894913, "learning_rate": 5.883223684210527e-06, "loss": 0.0324, "step": 10020 }, { "epoch": 4.124177631578948, "grad_norm": 14.311861038208008, "learning_rate": 5.879111842105264e-06, "loss": 0.0699, "step": 10030 }, { "epoch": 4.128289473684211, "grad_norm": 0.5761501789093018, "learning_rate": 5.8750000000000005e-06, "loss": 0.5303, "step": 10040 }, { "epoch": 4.1324013157894735, "grad_norm": 1.0431439876556396, "learning_rate": 5.870888157894737e-06, "loss": 0.0503, "step": 10050 }, { "epoch": 4.136513157894737, "grad_norm": 0.0012574722059071064, "learning_rate": 5.8667763157894746e-06, "loss": 0.0505, "step": 10060 }, { "epoch": 4.140625, "grad_norm": 1.2190923690795898, "learning_rate": 5.862664473684211e-06, "loss": 0.0039, "step": 10070 }, { "epoch": 4.144736842105263, "grad_norm": 0.010865393094718456, "learning_rate": 5.858552631578948e-06, "loss": 0.0199, "step": 10080 }, { "epoch": 4.1488486842105265, "grad_norm": 0.5229134559631348, "learning_rate": 5.854440789473684e-06, "loss": 0.3392, "step": 10090 }, { "epoch": 4.152960526315789, "grad_norm": 18.424875259399414, "learning_rate": 5.850328947368421e-06, "loss": 0.2647, "step": 10100 }, { "epoch": 4.157072368421052, "grad_norm": 0.25895851850509644, "learning_rate": 5.846217105263159e-06, "loss": 0.1165, "step": 10110 }, { "epoch": 4.161184210526316, "grad_norm": 53.10039520263672, "learning_rate": 5.842105263157896e-06, "loss": 0.0722, "step": 10120 }, { "epoch": 4.165296052631579, "grad_norm": 0.66398024559021, "learning_rate": 5.8379934210526325e-06, "loss": 0.1419, "step": 10130 }, { "epoch": 4.1694078947368425, "grad_norm": 0.014206175692379475, "learning_rate": 5.833881578947369e-06, "loss": 0.2225, "step": 10140 }, { "epoch": 4.173519736842105, "grad_norm": 5.291435718536377, "learning_rate": 5.829769736842106e-06, "loss": 0.0228, "step": 10150 }, { "epoch": 4.177631578947368, "grad_norm": 3.0082762241363525, "learning_rate": 5.825657894736842e-06, "loss": 0.2913, "step": 10160 }, { "epoch": 4.181743421052632, "grad_norm": 0.08264022320508957, "learning_rate": 5.821546052631579e-06, "loss": 0.3597, "step": 10170 }, { "epoch": 4.185855263157895, "grad_norm": 35.833770751953125, "learning_rate": 5.8174342105263156e-06, "loss": 0.0419, "step": 10180 }, { "epoch": 4.1899671052631575, "grad_norm": 85.25164031982422, "learning_rate": 5.813322368421053e-06, "loss": 0.2057, "step": 10190 }, { "epoch": 4.194078947368421, "grad_norm": 0.0025984339881688356, "learning_rate": 5.80921052631579e-06, "loss": 0.3559, "step": 10200 }, { "epoch": 4.198190789473684, "grad_norm": 0.2660520672798157, "learning_rate": 5.805098684210527e-06, "loss": 0.0053, "step": 10210 }, { "epoch": 4.202302631578948, "grad_norm": 0.7417953014373779, "learning_rate": 5.800986842105264e-06, "loss": 0.1898, "step": 10220 }, { "epoch": 4.206414473684211, "grad_norm": 0.37967509031295776, "learning_rate": 5.796875e-06, "loss": 0.27, "step": 10230 }, { "epoch": 4.2105263157894735, "grad_norm": 0.0646011158823967, "learning_rate": 5.792763157894738e-06, "loss": 0.5033, "step": 10240 }, { "epoch": 4.214638157894737, "grad_norm": 0.042865119874477386, "learning_rate": 5.788651315789474e-06, "loss": 0.0579, "step": 10250 }, { "epoch": 4.21875, "grad_norm": 0.905980110168457, "learning_rate": 5.784539473684211e-06, "loss": 0.0618, "step": 10260 }, { "epoch": 4.222861842105263, "grad_norm": 0.011512444354593754, "learning_rate": 5.780427631578948e-06, "loss": 0.0534, "step": 10270 }, { "epoch": 4.2269736842105265, "grad_norm": 0.009037915617227554, "learning_rate": 5.776315789473684e-06, "loss": 0.1521, "step": 10280 }, { "epoch": 4.231085526315789, "grad_norm": 44.18505859375, "learning_rate": 5.772203947368421e-06, "loss": 0.0802, "step": 10290 }, { "epoch": 4.235197368421052, "grad_norm": 0.24068020284175873, "learning_rate": 5.768092105263159e-06, "loss": 0.0494, "step": 10300 }, { "epoch": 4.239309210526316, "grad_norm": 3.5520458221435547, "learning_rate": 5.763980263157896e-06, "loss": 0.0715, "step": 10310 }, { "epoch": 4.243421052631579, "grad_norm": 0.0010897587053477764, "learning_rate": 5.759868421052632e-06, "loss": 0.087, "step": 10320 }, { "epoch": 4.2475328947368425, "grad_norm": 5.748364448547363, "learning_rate": 5.755756578947369e-06, "loss": 0.0721, "step": 10330 }, { "epoch": 4.251644736842105, "grad_norm": 0.019158218055963516, "learning_rate": 5.7516447368421055e-06, "loss": 0.458, "step": 10340 }, { "epoch": 4.255756578947368, "grad_norm": 0.030008016154170036, "learning_rate": 5.747532894736842e-06, "loss": 0.052, "step": 10350 }, { "epoch": 4.259868421052632, "grad_norm": 37.76634216308594, "learning_rate": 5.74342105263158e-06, "loss": 0.275, "step": 10360 }, { "epoch": 4.263980263157895, "grad_norm": 0.003279157681390643, "learning_rate": 5.739309210526316e-06, "loss": 0.0036, "step": 10370 }, { "epoch": 4.2680921052631575, "grad_norm": 0.0217817984521389, "learning_rate": 5.735197368421053e-06, "loss": 0.0017, "step": 10380 }, { "epoch": 4.272203947368421, "grad_norm": 0.6077308058738708, "learning_rate": 5.731085526315789e-06, "loss": 0.0008, "step": 10390 }, { "epoch": 4.276315789473684, "grad_norm": 0.4642079770565033, "learning_rate": 5.726973684210527e-06, "loss": 0.2043, "step": 10400 }, { "epoch": 4.280427631578947, "grad_norm": 0.002989094937220216, "learning_rate": 5.722861842105264e-06, "loss": 0.052, "step": 10410 }, { "epoch": 4.284539473684211, "grad_norm": 0.0273353923112154, "learning_rate": 5.719161184210527e-06, "loss": 0.3025, "step": 10420 }, { "epoch": 4.2886513157894735, "grad_norm": 77.05641174316406, "learning_rate": 5.715049342105263e-06, "loss": 0.2568, "step": 10430 }, { "epoch": 4.292763157894737, "grad_norm": 0.10438290983438492, "learning_rate": 5.7109375e-06, "loss": 0.142, "step": 10440 }, { "epoch": 4.296875, "grad_norm": 0.05108823999762535, "learning_rate": 5.706825657894738e-06, "loss": 0.123, "step": 10450 }, { "epoch": 4.300986842105263, "grad_norm": 24.811241149902344, "learning_rate": 5.702713815789475e-06, "loss": 0.0748, "step": 10460 }, { "epoch": 4.3050986842105265, "grad_norm": 1.4945974349975586, "learning_rate": 5.698601973684211e-06, "loss": 0.0022, "step": 10470 }, { "epoch": 4.309210526315789, "grad_norm": 0.003993542864918709, "learning_rate": 5.694490131578948e-06, "loss": 0.0409, "step": 10480 }, { "epoch": 4.313322368421053, "grad_norm": 0.3219471573829651, "learning_rate": 5.690378289473685e-06, "loss": 0.1386, "step": 10490 }, { "epoch": 4.317434210526316, "grad_norm": 15.706138610839844, "learning_rate": 5.686266447368421e-06, "loss": 0.0043, "step": 10500 }, { "epoch": 4.321546052631579, "grad_norm": 26.479265213012695, "learning_rate": 5.682154605263158e-06, "loss": 0.1416, "step": 10510 }, { "epoch": 4.3256578947368425, "grad_norm": 0.08380565047264099, "learning_rate": 5.678042763157895e-06, "loss": 0.0611, "step": 10520 }, { "epoch": 4.329769736842105, "grad_norm": 0.5438801050186157, "learning_rate": 5.673930921052632e-06, "loss": 0.2394, "step": 10530 }, { "epoch": 4.333881578947368, "grad_norm": 83.95884704589844, "learning_rate": 5.6698190789473685e-06, "loss": 0.343, "step": 10540 }, { "epoch": 4.337993421052632, "grad_norm": 68.3953857421875, "learning_rate": 5.665707236842106e-06, "loss": 0.0774, "step": 10550 }, { "epoch": 4.342105263157895, "grad_norm": 0.2621876299381256, "learning_rate": 5.661595394736843e-06, "loss": 0.0171, "step": 10560 }, { "epoch": 4.3462171052631575, "grad_norm": 3.258847951889038, "learning_rate": 5.65748355263158e-06, "loss": 0.0275, "step": 10570 }, { "epoch": 4.350328947368421, "grad_norm": 52.767311096191406, "learning_rate": 5.653371710526317e-06, "loss": 0.0406, "step": 10580 }, { "epoch": 4.354440789473684, "grad_norm": 0.07652966678142548, "learning_rate": 5.649259868421053e-06, "loss": 0.1451, "step": 10590 }, { "epoch": 4.358552631578947, "grad_norm": 0.0008912061457522213, "learning_rate": 5.64514802631579e-06, "loss": 0.007, "step": 10600 }, { "epoch": 4.362664473684211, "grad_norm": 0.0011371123837307096, "learning_rate": 5.6410361842105265e-06, "loss": 0.0695, "step": 10610 }, { "epoch": 4.3667763157894735, "grad_norm": 1.0595893859863281, "learning_rate": 5.636924342105263e-06, "loss": 0.0103, "step": 10620 }, { "epoch": 4.370888157894737, "grad_norm": 34.24848937988281, "learning_rate": 5.6328125e-06, "loss": 0.5059, "step": 10630 }, { "epoch": 4.375, "grad_norm": 0.004730758722871542, "learning_rate": 5.628700657894738e-06, "loss": 0.0572, "step": 10640 }, { "epoch": 4.379111842105263, "grad_norm": 0.024553537368774414, "learning_rate": 5.624588815789475e-06, "loss": 0.0008, "step": 10650 }, { "epoch": 4.3832236842105265, "grad_norm": 0.03798150643706322, "learning_rate": 5.620476973684211e-06, "loss": 0.1627, "step": 10660 }, { "epoch": 4.387335526315789, "grad_norm": 1.5321662425994873, "learning_rate": 5.616365131578948e-06, "loss": 0.266, "step": 10670 }, { "epoch": 4.391447368421053, "grad_norm": 0.02827838808298111, "learning_rate": 5.6122532894736844e-06, "loss": 0.0043, "step": 10680 }, { "epoch": 4.395559210526316, "grad_norm": 0.8193231225013733, "learning_rate": 5.608141447368422e-06, "loss": 0.1079, "step": 10690 }, { "epoch": 4.399671052631579, "grad_norm": 0.008944814093410969, "learning_rate": 5.6040296052631585e-06, "loss": 0.1774, "step": 10700 }, { "epoch": 4.4037828947368425, "grad_norm": 71.00315856933594, "learning_rate": 5.599917763157895e-06, "loss": 0.252, "step": 10710 }, { "epoch": 4.407894736842105, "grad_norm": 0.2633829712867737, "learning_rate": 5.595805921052632e-06, "loss": 0.2704, "step": 10720 }, { "epoch": 4.412006578947368, "grad_norm": 0.5669187903404236, "learning_rate": 5.591694078947368e-06, "loss": 0.1003, "step": 10730 }, { "epoch": 4.416118421052632, "grad_norm": 0.027173712849617004, "learning_rate": 5.587582236842107e-06, "loss": 0.079, "step": 10740 }, { "epoch": 4.420230263157895, "grad_norm": 0.06833348423242569, "learning_rate": 5.583470394736843e-06, "loss": 0.2802, "step": 10750 }, { "epoch": 4.4243421052631575, "grad_norm": 77.60299682617188, "learning_rate": 5.57935855263158e-06, "loss": 0.2194, "step": 10760 }, { "epoch": 4.428453947368421, "grad_norm": 0.24349355697631836, "learning_rate": 5.5752467105263165e-06, "loss": 0.2518, "step": 10770 }, { "epoch": 4.432565789473684, "grad_norm": 0.6897732019424438, "learning_rate": 5.571134868421053e-06, "loss": 0.3367, "step": 10780 }, { "epoch": 4.436677631578947, "grad_norm": 4.426616668701172, "learning_rate": 5.56702302631579e-06, "loss": 0.2116, "step": 10790 }, { "epoch": 4.440789473684211, "grad_norm": 0.06270567327737808, "learning_rate": 5.562911184210526e-06, "loss": 0.0551, "step": 10800 }, { "epoch": 4.4449013157894735, "grad_norm": 15.072378158569336, "learning_rate": 5.558799342105263e-06, "loss": 0.0283, "step": 10810 }, { "epoch": 4.449013157894737, "grad_norm": 0.22645829617977142, "learning_rate": 5.5546875e-06, "loss": 0.0336, "step": 10820 }, { "epoch": 4.453125, "grad_norm": 0.595355749130249, "learning_rate": 5.550575657894738e-06, "loss": 0.1071, "step": 10830 }, { "epoch": 4.457236842105263, "grad_norm": 1.1005072593688965, "learning_rate": 5.546463815789474e-06, "loss": 0.047, "step": 10840 }, { "epoch": 4.4613486842105265, "grad_norm": 121.18562316894531, "learning_rate": 5.542351973684211e-06, "loss": 0.7024, "step": 10850 }, { "epoch": 4.465460526315789, "grad_norm": 0.004475445486605167, "learning_rate": 5.538240131578948e-06, "loss": 0.0434, "step": 10860 }, { "epoch": 4.469572368421053, "grad_norm": 0.03283686935901642, "learning_rate": 5.534128289473685e-06, "loss": 0.0131, "step": 10870 }, { "epoch": 4.473684210526316, "grad_norm": 0.7433743476867676, "learning_rate": 5.530016447368422e-06, "loss": 0.0166, "step": 10880 }, { "epoch": 4.477796052631579, "grad_norm": 77.12642669677734, "learning_rate": 5.525904605263158e-06, "loss": 0.1083, "step": 10890 }, { "epoch": 4.4819078947368425, "grad_norm": 0.37571632862091064, "learning_rate": 5.521792763157895e-06, "loss": 0.0452, "step": 10900 }, { "epoch": 4.486019736842105, "grad_norm": 0.03792794793844223, "learning_rate": 5.5176809210526315e-06, "loss": 0.0042, "step": 10910 }, { "epoch": 4.490131578947368, "grad_norm": 0.017094863578677177, "learning_rate": 5.513569078947368e-06, "loss": 0.1781, "step": 10920 }, { "epoch": 4.494243421052632, "grad_norm": 9.493055404163897e-05, "learning_rate": 5.509457236842106e-06, "loss": 0.5008, "step": 10930 }, { "epoch": 4.498355263157895, "grad_norm": 1.660346508026123, "learning_rate": 5.505345394736843e-06, "loss": 0.0303, "step": 10940 }, { "epoch": 4.5024671052631575, "grad_norm": 0.02450186386704445, "learning_rate": 5.50123355263158e-06, "loss": 0.0511, "step": 10950 }, { "epoch": 4.506578947368421, "grad_norm": 0.11601614207029343, "learning_rate": 5.497121710526316e-06, "loss": 0.0899, "step": 10960 }, { "epoch": 4.510690789473684, "grad_norm": 4.531919002532959, "learning_rate": 5.493009868421053e-06, "loss": 0.0918, "step": 10970 }, { "epoch": 4.514802631578947, "grad_norm": 8.091043472290039, "learning_rate": 5.4888980263157895e-06, "loss": 0.187, "step": 10980 }, { "epoch": 4.518914473684211, "grad_norm": 51.77922821044922, "learning_rate": 5.484786184210527e-06, "loss": 0.0883, "step": 10990 }, { "epoch": 4.5230263157894735, "grad_norm": 0.0011709548998624086, "learning_rate": 5.4806743421052635e-06, "loss": 0.183, "step": 11000 }, { "epoch": 4.527138157894737, "grad_norm": 75.65191650390625, "learning_rate": 5.4765625e-06, "loss": 0.5483, "step": 11010 }, { "epoch": 4.53125, "grad_norm": 0.31503334641456604, "learning_rate": 5.472450657894738e-06, "loss": 0.021, "step": 11020 }, { "epoch": 4.535361842105263, "grad_norm": 0.0058042751625180244, "learning_rate": 5.468338815789474e-06, "loss": 0.0388, "step": 11030 }, { "epoch": 4.5394736842105265, "grad_norm": 0.70481938123703, "learning_rate": 5.464226973684212e-06, "loss": 0.1086, "step": 11040 }, { "epoch": 4.543585526315789, "grad_norm": 6.683977127075195, "learning_rate": 5.460115131578948e-06, "loss": 0.1748, "step": 11050 }, { "epoch": 4.547697368421053, "grad_norm": 0.8053579926490784, "learning_rate": 5.456003289473685e-06, "loss": 0.2015, "step": 11060 }, { "epoch": 4.551809210526316, "grad_norm": 0.025869661942124367, "learning_rate": 5.4518914473684215e-06, "loss": 0.0331, "step": 11070 }, { "epoch": 4.555921052631579, "grad_norm": 11.96985912322998, "learning_rate": 5.447779605263158e-06, "loss": 0.1065, "step": 11080 }, { "epoch": 4.5600328947368425, "grad_norm": 0.0028174128383398056, "learning_rate": 5.443667763157895e-06, "loss": 0.0375, "step": 11090 }, { "epoch": 4.564144736842105, "grad_norm": 85.98474884033203, "learning_rate": 5.439555921052631e-06, "loss": 0.14, "step": 11100 }, { "epoch": 4.568256578947368, "grad_norm": 7.298089981079102, "learning_rate": 5.435444078947369e-06, "loss": 0.1029, "step": 11110 }, { "epoch": 4.572368421052632, "grad_norm": 0.0005351307918317616, "learning_rate": 5.431332236842106e-06, "loss": 0.2296, "step": 11120 }, { "epoch": 4.576480263157895, "grad_norm": 0.5227247476577759, "learning_rate": 5.427220394736843e-06, "loss": 0.0584, "step": 11130 }, { "epoch": 4.5805921052631575, "grad_norm": 0.37593796849250793, "learning_rate": 5.4231085526315794e-06, "loss": 0.0093, "step": 11140 }, { "epoch": 4.584703947368421, "grad_norm": 0.3121981918811798, "learning_rate": 5.418996710526316e-06, "loss": 0.0211, "step": 11150 }, { "epoch": 4.588815789473684, "grad_norm": 0.44901901483535767, "learning_rate": 5.4148848684210535e-06, "loss": 0.0044, "step": 11160 }, { "epoch": 4.592927631578947, "grad_norm": 34.624664306640625, "learning_rate": 5.41077302631579e-06, "loss": 0.1367, "step": 11170 }, { "epoch": 4.597039473684211, "grad_norm": 17.65924835205078, "learning_rate": 5.406661184210527e-06, "loss": 0.419, "step": 11180 }, { "epoch": 4.6011513157894735, "grad_norm": 0.12152664363384247, "learning_rate": 5.402549342105263e-06, "loss": 0.0223, "step": 11190 }, { "epoch": 4.605263157894737, "grad_norm": 0.002171924104914069, "learning_rate": 5.3984375e-06, "loss": 0.0406, "step": 11200 }, { "epoch": 4.609375, "grad_norm": 0.030598795041441917, "learning_rate": 5.394325657894738e-06, "loss": 0.0308, "step": 11210 }, { "epoch": 4.613486842105263, "grad_norm": 116.11152648925781, "learning_rate": 5.390213815789475e-06, "loss": 0.1755, "step": 11220 }, { "epoch": 4.6175986842105265, "grad_norm": 0.0010453220456838608, "learning_rate": 5.3861019736842114e-06, "loss": 0.4576, "step": 11230 }, { "epoch": 4.621710526315789, "grad_norm": 0.016688738018274307, "learning_rate": 5.381990131578948e-06, "loss": 0.2701, "step": 11240 }, { "epoch": 4.625822368421053, "grad_norm": 0.0017963089048862457, "learning_rate": 5.377878289473685e-06, "loss": 0.0009, "step": 11250 }, { "epoch": 4.629934210526316, "grad_norm": 0.00248286803252995, "learning_rate": 5.373766447368421e-06, "loss": 0.2538, "step": 11260 }, { "epoch": 4.634046052631579, "grad_norm": 0.01858946494758129, "learning_rate": 5.369654605263158e-06, "loss": 0.5668, "step": 11270 }, { "epoch": 4.6381578947368425, "grad_norm": 84.9400634765625, "learning_rate": 5.3655427631578945e-06, "loss": 0.1191, "step": 11280 }, { "epoch": 4.642269736842105, "grad_norm": 46.583229064941406, "learning_rate": 5.361430921052632e-06, "loss": 0.0579, "step": 11290 }, { "epoch": 4.646381578947368, "grad_norm": 3.9586727619171143, "learning_rate": 5.3573190789473686e-06, "loss": 0.1239, "step": 11300 }, { "epoch": 4.650493421052632, "grad_norm": 1.132432222366333, "learning_rate": 5.353207236842106e-06, "loss": 0.1484, "step": 11310 }, { "epoch": 4.654605263157895, "grad_norm": 2.125668525695801, "learning_rate": 5.349095394736843e-06, "loss": 0.412, "step": 11320 }, { "epoch": 4.6587171052631575, "grad_norm": 0.03645682334899902, "learning_rate": 5.344983552631579e-06, "loss": 0.0031, "step": 11330 }, { "epoch": 4.662828947368421, "grad_norm": 81.14274597167969, "learning_rate": 5.340871710526317e-06, "loss": 0.0924, "step": 11340 }, { "epoch": 4.666940789473684, "grad_norm": 0.13125614821910858, "learning_rate": 5.336759868421053e-06, "loss": 0.4672, "step": 11350 }, { "epoch": 4.671052631578947, "grad_norm": 0.0066694519482553005, "learning_rate": 5.33264802631579e-06, "loss": 0.0737, "step": 11360 }, { "epoch": 4.675164473684211, "grad_norm": 0.05095091834664345, "learning_rate": 5.3285361842105265e-06, "loss": 0.2657, "step": 11370 }, { "epoch": 4.6792763157894735, "grad_norm": 0.019465183839201927, "learning_rate": 5.324424342105263e-06, "loss": 0.0345, "step": 11380 }, { "epoch": 4.683388157894737, "grad_norm": 0.2005145102739334, "learning_rate": 5.3203125e-06, "loss": 0.0541, "step": 11390 }, { "epoch": 4.6875, "grad_norm": 2.549844264984131, "learning_rate": 5.316200657894738e-06, "loss": 0.2792, "step": 11400 }, { "epoch": 4.691611842105263, "grad_norm": 9.314865112304688, "learning_rate": 5.312088815789475e-06, "loss": 0.0775, "step": 11410 }, { "epoch": 4.6957236842105265, "grad_norm": 0.14117684960365295, "learning_rate": 5.307976973684211e-06, "loss": 0.295, "step": 11420 }, { "epoch": 4.699835526315789, "grad_norm": 0.00023204373428598046, "learning_rate": 5.303865131578948e-06, "loss": 0.1064, "step": 11430 }, { "epoch": 4.703947368421053, "grad_norm": 0.059099286794662476, "learning_rate": 5.2997532894736845e-06, "loss": 0.1719, "step": 11440 }, { "epoch": 4.708059210526316, "grad_norm": 1.6177492141723633, "learning_rate": 5.295641447368421e-06, "loss": 0.0534, "step": 11450 }, { "epoch": 4.712171052631579, "grad_norm": 0.029669273644685745, "learning_rate": 5.2915296052631585e-06, "loss": 0.0223, "step": 11460 }, { "epoch": 4.7162828947368425, "grad_norm": 0.011819909326732159, "learning_rate": 5.287417763157895e-06, "loss": 0.1184, "step": 11470 }, { "epoch": 4.720394736842105, "grad_norm": 89.91704559326172, "learning_rate": 5.283305921052632e-06, "loss": 0.4615, "step": 11480 }, { "epoch": 4.724506578947368, "grad_norm": 0.08650683611631393, "learning_rate": 5.279194078947368e-06, "loss": 0.0019, "step": 11490 }, { "epoch": 4.728618421052632, "grad_norm": 0.15594571828842163, "learning_rate": 5.275082236842106e-06, "loss": 0.0339, "step": 11500 }, { "epoch": 4.732730263157895, "grad_norm": 1.9714744091033936, "learning_rate": 5.270970394736843e-06, "loss": 0.2982, "step": 11510 }, { "epoch": 4.7368421052631575, "grad_norm": 0.053652070462703705, "learning_rate": 5.26685855263158e-06, "loss": 0.1086, "step": 11520 }, { "epoch": 4.740953947368421, "grad_norm": 2.379514217376709, "learning_rate": 5.2627467105263165e-06, "loss": 0.0506, "step": 11530 }, { "epoch": 4.745065789473684, "grad_norm": 0.000377872318495065, "learning_rate": 5.258634868421053e-06, "loss": 0.0937, "step": 11540 }, { "epoch": 4.749177631578947, "grad_norm": 1.3539845943450928, "learning_rate": 5.25452302631579e-06, "loss": 0.0948, "step": 11550 }, { "epoch": 4.753289473684211, "grad_norm": 1.4154525995254517, "learning_rate": 5.250411184210526e-06, "loss": 0.1704, "step": 11560 }, { "epoch": 4.7574013157894735, "grad_norm": 0.5577759146690369, "learning_rate": 5.246299342105263e-06, "loss": 0.0408, "step": 11570 }, { "epoch": 4.761513157894737, "grad_norm": 74.89480590820312, "learning_rate": 5.2421875e-06, "loss": 0.3608, "step": 11580 }, { "epoch": 4.765625, "grad_norm": 5.859063625335693, "learning_rate": 5.238075657894738e-06, "loss": 0.2313, "step": 11590 }, { "epoch": 4.769736842105263, "grad_norm": 0.0718579813838005, "learning_rate": 5.2339638157894744e-06, "loss": 0.0054, "step": 11600 }, { "epoch": 4.7738486842105265, "grad_norm": 0.0002502045244909823, "learning_rate": 5.229851973684211e-06, "loss": 0.0367, "step": 11610 }, { "epoch": 4.777960526315789, "grad_norm": 0.042701832950115204, "learning_rate": 5.225740131578948e-06, "loss": 0.2447, "step": 11620 }, { "epoch": 4.782072368421053, "grad_norm": 0.010373505763709545, "learning_rate": 5.221628289473685e-06, "loss": 0.2749, "step": 11630 }, { "epoch": 4.786184210526316, "grad_norm": 0.005021689459681511, "learning_rate": 5.217516447368422e-06, "loss": 0.0291, "step": 11640 }, { "epoch": 4.790296052631579, "grad_norm": 0.009168042801320553, "learning_rate": 5.213404605263158e-06, "loss": 0.012, "step": 11650 }, { "epoch": 4.7944078947368425, "grad_norm": 0.08961399644613266, "learning_rate": 5.209292763157895e-06, "loss": 0.0174, "step": 11660 }, { "epoch": 4.798519736842105, "grad_norm": 6.835995674133301, "learning_rate": 5.2051809210526315e-06, "loss": 0.306, "step": 11670 }, { "epoch": 4.802631578947368, "grad_norm": 110.7245101928711, "learning_rate": 5.201069078947368e-06, "loss": 0.7956, "step": 11680 }, { "epoch": 4.806743421052632, "grad_norm": 85.60328674316406, "learning_rate": 5.1969572368421064e-06, "loss": 0.1353, "step": 11690 }, { "epoch": 4.810855263157895, "grad_norm": 8.091873168945312, "learning_rate": 5.192845394736843e-06, "loss": 0.0128, "step": 11700 }, { "epoch": 4.8149671052631575, "grad_norm": 83.81761932373047, "learning_rate": 5.18873355263158e-06, "loss": 0.3359, "step": 11710 }, { "epoch": 4.819078947368421, "grad_norm": 6.044814109802246, "learning_rate": 5.184621710526316e-06, "loss": 0.1492, "step": 11720 }, { "epoch": 4.823190789473684, "grad_norm": 0.00032899685902521014, "learning_rate": 5.180509868421053e-06, "loss": 0.1777, "step": 11730 }, { "epoch": 4.827302631578947, "grad_norm": 0.09665809571743011, "learning_rate": 5.1763980263157895e-06, "loss": 0.0136, "step": 11740 }, { "epoch": 4.831414473684211, "grad_norm": 0.19958020746707916, "learning_rate": 5.172286184210527e-06, "loss": 0.234, "step": 11750 }, { "epoch": 4.8355263157894735, "grad_norm": 0.044992148876190186, "learning_rate": 5.1681743421052636e-06, "loss": 0.0143, "step": 11760 }, { "epoch": 4.839638157894737, "grad_norm": 0.6200563311576843, "learning_rate": 5.1640625e-06, "loss": 0.1882, "step": 11770 }, { "epoch": 4.84375, "grad_norm": 0.04625256359577179, "learning_rate": 5.159950657894738e-06, "loss": 0.0057, "step": 11780 }, { "epoch": 4.847861842105263, "grad_norm": 0.7069116234779358, "learning_rate": 5.155838815789474e-06, "loss": 0.3012, "step": 11790 }, { "epoch": 4.8519736842105265, "grad_norm": 0.0004912884323857725, "learning_rate": 5.151726973684212e-06, "loss": 0.0003, "step": 11800 }, { "epoch": 4.856085526315789, "grad_norm": 0.00741523178294301, "learning_rate": 5.147615131578948e-06, "loss": 0.1208, "step": 11810 }, { "epoch": 4.860197368421053, "grad_norm": 34.17157745361328, "learning_rate": 5.143503289473685e-06, "loss": 0.1658, "step": 11820 }, { "epoch": 4.864309210526316, "grad_norm": 43.198394775390625, "learning_rate": 5.1393914473684215e-06, "loss": 0.1518, "step": 11830 }, { "epoch": 4.868421052631579, "grad_norm": 0.00559881329536438, "learning_rate": 5.135279605263158e-06, "loss": 0.1322, "step": 11840 }, { "epoch": 4.8725328947368425, "grad_norm": 0.061246518045663834, "learning_rate": 5.131167763157895e-06, "loss": 0.1169, "step": 11850 }, { "epoch": 4.876644736842105, "grad_norm": 2.1810858249664307, "learning_rate": 5.127055921052631e-06, "loss": 0.0755, "step": 11860 }, { "epoch": 4.880756578947368, "grad_norm": 0.13748008012771606, "learning_rate": 5.122944078947368e-06, "loss": 0.0136, "step": 11870 }, { "epoch": 4.884868421052632, "grad_norm": 13.2427339553833, "learning_rate": 5.118832236842106e-06, "loss": 0.0096, "step": 11880 }, { "epoch": 4.888980263157895, "grad_norm": 0.20603924989700317, "learning_rate": 5.114720394736843e-06, "loss": 0.0793, "step": 11890 }, { "epoch": 4.8930921052631575, "grad_norm": 56.690826416015625, "learning_rate": 5.1106085526315795e-06, "loss": 0.3857, "step": 11900 }, { "epoch": 4.897203947368421, "grad_norm": 5.994784832000732, "learning_rate": 5.106496710526316e-06, "loss": 0.0047, "step": 11910 }, { "epoch": 4.901315789473684, "grad_norm": 0.028743823990225792, "learning_rate": 5.102384868421053e-06, "loss": 0.1121, "step": 11920 }, { "epoch": 4.905427631578947, "grad_norm": 0.002641650615260005, "learning_rate": 5.09827302631579e-06, "loss": 0.0928, "step": 11930 }, { "epoch": 4.909539473684211, "grad_norm": 0.11559925973415375, "learning_rate": 5.094161184210527e-06, "loss": 0.0364, "step": 11940 }, { "epoch": 4.9136513157894735, "grad_norm": 0.013286828063428402, "learning_rate": 5.090049342105263e-06, "loss": 0.2054, "step": 11950 }, { "epoch": 4.917763157894737, "grad_norm": 0.14687436819076538, "learning_rate": 5.0859375e-06, "loss": 0.2326, "step": 11960 }, { "epoch": 4.921875, "grad_norm": 0.004031555261462927, "learning_rate": 5.081825657894737e-06, "loss": 0.0089, "step": 11970 }, { "epoch": 4.925986842105263, "grad_norm": 0.21741409599781036, "learning_rate": 5.077713815789475e-06, "loss": 0.2988, "step": 11980 }, { "epoch": 4.9300986842105265, "grad_norm": 0.027137205004692078, "learning_rate": 5.0736019736842115e-06, "loss": 0.208, "step": 11990 }, { "epoch": 4.934210526315789, "grad_norm": 40.1978645324707, "learning_rate": 5.069490131578948e-06, "loss": 0.0322, "step": 12000 }, { "epoch": 4.938322368421053, "grad_norm": 84.67223358154297, "learning_rate": 5.065378289473685e-06, "loss": 0.2434, "step": 12010 }, { "epoch": 4.942434210526316, "grad_norm": 0.4514327347278595, "learning_rate": 5.061266447368421e-06, "loss": 0.23, "step": 12020 }, { "epoch": 4.946546052631579, "grad_norm": 19.516311645507812, "learning_rate": 5.057154605263158e-06, "loss": 0.1211, "step": 12030 }, { "epoch": 4.9506578947368425, "grad_norm": 69.44686126708984, "learning_rate": 5.0530427631578945e-06, "loss": 0.0526, "step": 12040 }, { "epoch": 4.954769736842105, "grad_norm": 0.7357942461967468, "learning_rate": 5.048930921052632e-06, "loss": 0.096, "step": 12050 }, { "epoch": 4.958881578947368, "grad_norm": 55.00428009033203, "learning_rate": 5.044819078947369e-06, "loss": 0.1763, "step": 12060 }, { "epoch": 4.962993421052632, "grad_norm": 0.006061846856027842, "learning_rate": 5.040707236842106e-06, "loss": 0.2433, "step": 12070 }, { "epoch": 4.967105263157895, "grad_norm": 0.01570850983262062, "learning_rate": 5.036595394736843e-06, "loss": 0.005, "step": 12080 }, { "epoch": 4.9712171052631575, "grad_norm": 0.002090919530019164, "learning_rate": 5.032483552631579e-06, "loss": 0.0092, "step": 12090 }, { "epoch": 4.975328947368421, "grad_norm": 0.02324344776570797, "learning_rate": 5.028371710526317e-06, "loss": 0.2655, "step": 12100 }, { "epoch": 4.979440789473684, "grad_norm": 0.023654013872146606, "learning_rate": 5.024259868421053e-06, "loss": 0.1504, "step": 12110 }, { "epoch": 4.983552631578947, "grad_norm": 70.16049194335938, "learning_rate": 5.02014802631579e-06, "loss": 0.3557, "step": 12120 }, { "epoch": 4.987664473684211, "grad_norm": 0.004474622197449207, "learning_rate": 5.0160361842105265e-06, "loss": 0.266, "step": 12130 }, { "epoch": 4.9917763157894735, "grad_norm": 0.035371605306863785, "learning_rate": 5.011924342105263e-06, "loss": 0.0795, "step": 12140 }, { "epoch": 4.995888157894737, "grad_norm": 0.007857566699385643, "learning_rate": 5.0078125e-06, "loss": 0.0035, "step": 12150 }, { "epoch": 5.0, "grad_norm": 118.69224548339844, "learning_rate": 5.003700657894738e-06, "loss": 0.1007, "step": 12160 }, { "epoch": 5.004111842105263, "grad_norm": 3.55021071434021, "learning_rate": 4.999588815789474e-06, "loss": 0.09, "step": 12170 }, { "epoch": 5.0082236842105265, "grad_norm": 2.0292088985443115, "learning_rate": 4.9954769736842104e-06, "loss": 0.051, "step": 12180 }, { "epoch": 5.012335526315789, "grad_norm": 0.41962769627571106, "learning_rate": 4.991365131578948e-06, "loss": 0.1036, "step": 12190 }, { "epoch": 5.016447368421052, "grad_norm": 0.009977146051824093, "learning_rate": 4.9872532894736845e-06, "loss": 0.0087, "step": 12200 }, { "epoch": 5.020559210526316, "grad_norm": 0.0003735716745723039, "learning_rate": 4.983141447368421e-06, "loss": 0.0022, "step": 12210 }, { "epoch": 5.024671052631579, "grad_norm": 16.074596405029297, "learning_rate": 4.9790296052631586e-06, "loss": 0.0139, "step": 12220 }, { "epoch": 5.0287828947368425, "grad_norm": 3.9828500747680664, "learning_rate": 4.974917763157895e-06, "loss": 0.3339, "step": 12230 }, { "epoch": 5.032894736842105, "grad_norm": 54.579383850097656, "learning_rate": 4.970805921052632e-06, "loss": 0.0736, "step": 12240 }, { "epoch": 5.037006578947368, "grad_norm": 0.0028906783554702997, "learning_rate": 4.966694078947369e-06, "loss": 0.0565, "step": 12250 }, { "epoch": 5.041118421052632, "grad_norm": 0.09334980696439743, "learning_rate": 4.962582236842106e-06, "loss": 0.0103, "step": 12260 }, { "epoch": 5.045230263157895, "grad_norm": 0.44651469588279724, "learning_rate": 4.9584703947368424e-06, "loss": 0.0421, "step": 12270 }, { "epoch": 5.0493421052631575, "grad_norm": 0.3897380530834198, "learning_rate": 4.95435855263158e-06, "loss": 0.0438, "step": 12280 }, { "epoch": 5.053453947368421, "grad_norm": 0.2058589607477188, "learning_rate": 4.9502467105263165e-06, "loss": 0.0285, "step": 12290 }, { "epoch": 5.057565789473684, "grad_norm": 7.561193466186523, "learning_rate": 4.946134868421053e-06, "loss": 0.0042, "step": 12300 }, { "epoch": 5.061677631578948, "grad_norm": 47.261566162109375, "learning_rate": 4.94202302631579e-06, "loss": 0.2775, "step": 12310 }, { "epoch": 5.065789473684211, "grad_norm": 45.68752670288086, "learning_rate": 4.937911184210526e-06, "loss": 0.1912, "step": 12320 }, { "epoch": 5.0699013157894735, "grad_norm": 0.008435727097094059, "learning_rate": 4.933799342105264e-06, "loss": 0.0739, "step": 12330 }, { "epoch": 5.074013157894737, "grad_norm": 0.40947651863098145, "learning_rate": 4.9296875e-06, "loss": 0.1447, "step": 12340 }, { "epoch": 5.078125, "grad_norm": 0.009739860892295837, "learning_rate": 4.925575657894737e-06, "loss": 0.0158, "step": 12350 }, { "epoch": 5.082236842105263, "grad_norm": 0.007655106019228697, "learning_rate": 4.921463815789474e-06, "loss": 0.0416, "step": 12360 }, { "epoch": 5.0863486842105265, "grad_norm": 3.085749387741089, "learning_rate": 4.917351973684211e-06, "loss": 0.0112, "step": 12370 }, { "epoch": 5.090460526315789, "grad_norm": 0.8423932194709778, "learning_rate": 4.913240131578948e-06, "loss": 0.0338, "step": 12380 }, { "epoch": 5.094572368421052, "grad_norm": 0.0011507633607834578, "learning_rate": 4.909128289473684e-06, "loss": 0.0024, "step": 12390 }, { "epoch": 5.098684210526316, "grad_norm": 0.06639353185892105, "learning_rate": 4.905016447368422e-06, "loss": 0.1583, "step": 12400 }, { "epoch": 5.102796052631579, "grad_norm": 0.28097525238990784, "learning_rate": 4.900904605263158e-06, "loss": 0.0059, "step": 12410 }, { "epoch": 5.1069078947368425, "grad_norm": 0.0003991532139480114, "learning_rate": 4.896792763157895e-06, "loss": 0.0997, "step": 12420 }, { "epoch": 5.111019736842105, "grad_norm": 0.10429614037275314, "learning_rate": 4.892680921052632e-06, "loss": 0.1612, "step": 12430 }, { "epoch": 5.115131578947368, "grad_norm": 1.887984037399292, "learning_rate": 4.888569078947369e-06, "loss": 0.0778, "step": 12440 }, { "epoch": 5.119243421052632, "grad_norm": 1.0391831398010254, "learning_rate": 4.884457236842106e-06, "loss": 0.0022, "step": 12450 }, { "epoch": 5.123355263157895, "grad_norm": 0.005739231593906879, "learning_rate": 4.880345394736842e-06, "loss": 0.2004, "step": 12460 }, { "epoch": 5.1274671052631575, "grad_norm": 0.3580719828605652, "learning_rate": 4.87623355263158e-06, "loss": 0.0694, "step": 12470 }, { "epoch": 5.131578947368421, "grad_norm": 0.00015461303701158613, "learning_rate": 4.872121710526316e-06, "loss": 0.1299, "step": 12480 }, { "epoch": 5.135690789473684, "grad_norm": 0.012092826887965202, "learning_rate": 4.868009868421053e-06, "loss": 0.1834, "step": 12490 }, { "epoch": 5.139802631578948, "grad_norm": 0.019745737314224243, "learning_rate": 4.8638980263157895e-06, "loss": 0.0675, "step": 12500 }, { "epoch": 5.143914473684211, "grad_norm": 9.999950408935547, "learning_rate": 4.859786184210526e-06, "loss": 0.0185, "step": 12510 }, { "epoch": 5.1480263157894735, "grad_norm": 4.208477973937988, "learning_rate": 4.855674342105264e-06, "loss": 0.116, "step": 12520 }, { "epoch": 5.152138157894737, "grad_norm": 0.0007267239852808416, "learning_rate": 4.8515625e-06, "loss": 0.0604, "step": 12530 }, { "epoch": 5.15625, "grad_norm": 0.0007356388377957046, "learning_rate": 4.847450657894738e-06, "loss": 0.0038, "step": 12540 }, { "epoch": 5.160361842105263, "grad_norm": 76.24391174316406, "learning_rate": 4.843338815789474e-06, "loss": 0.264, "step": 12550 }, { "epoch": 5.1644736842105265, "grad_norm": 0.02224615029990673, "learning_rate": 4.839226973684211e-06, "loss": 0.0107, "step": 12560 }, { "epoch": 5.168585526315789, "grad_norm": 0.872387707233429, "learning_rate": 4.835115131578948e-06, "loss": 0.0017, "step": 12570 }, { "epoch": 5.172697368421052, "grad_norm": 0.055020976811647415, "learning_rate": 4.831003289473685e-06, "loss": 0.1254, "step": 12580 }, { "epoch": 5.176809210526316, "grad_norm": 0.028826193884015083, "learning_rate": 4.8268914473684215e-06, "loss": 0.0145, "step": 12590 }, { "epoch": 5.180921052631579, "grad_norm": 0.12968547642230988, "learning_rate": 4.822779605263158e-06, "loss": 0.0151, "step": 12600 }, { "epoch": 5.1850328947368425, "grad_norm": 0.10976220667362213, "learning_rate": 4.818667763157895e-06, "loss": 0.0207, "step": 12610 }, { "epoch": 5.189144736842105, "grad_norm": 0.003534722840413451, "learning_rate": 4.814555921052632e-06, "loss": 0.0085, "step": 12620 }, { "epoch": 5.193256578947368, "grad_norm": 0.008794968947768211, "learning_rate": 4.810444078947369e-06, "loss": 0.125, "step": 12630 }, { "epoch": 5.197368421052632, "grad_norm": 1.5557653903961182, "learning_rate": 4.806332236842105e-06, "loss": 0.0224, "step": 12640 }, { "epoch": 5.201480263157895, "grad_norm": 0.010107816196978092, "learning_rate": 4.802220394736842e-06, "loss": 0.1335, "step": 12650 }, { "epoch": 5.2055921052631575, "grad_norm": 4.243147373199463, "learning_rate": 4.7981085526315795e-06, "loss": 0.0065, "step": 12660 }, { "epoch": 5.209703947368421, "grad_norm": 0.33208709955215454, "learning_rate": 4.793996710526316e-06, "loss": 0.1848, "step": 12670 }, { "epoch": 5.213815789473684, "grad_norm": 4.119780540466309, "learning_rate": 4.789884868421053e-06, "loss": 0.0587, "step": 12680 }, { "epoch": 5.217927631578948, "grad_norm": 115.96353912353516, "learning_rate": 4.78577302631579e-06, "loss": 0.4392, "step": 12690 }, { "epoch": 5.222039473684211, "grad_norm": 0.0005382675444707274, "learning_rate": 4.781661184210527e-06, "loss": 0.0101, "step": 12700 }, { "epoch": 5.2261513157894735, "grad_norm": 0.5530701279640198, "learning_rate": 4.777549342105263e-06, "loss": 0.0904, "step": 12710 }, { "epoch": 5.230263157894737, "grad_norm": 86.44254302978516, "learning_rate": 4.773437500000001e-06, "loss": 0.135, "step": 12720 }, { "epoch": 5.234375, "grad_norm": 2.6025938987731934, "learning_rate": 4.7693256578947374e-06, "loss": 0.0513, "step": 12730 }, { "epoch": 5.238486842105263, "grad_norm": 0.04528487101197243, "learning_rate": 4.765213815789474e-06, "loss": 0.1211, "step": 12740 }, { "epoch": 5.2425986842105265, "grad_norm": 0.049920398741960526, "learning_rate": 4.761101973684211e-06, "loss": 0.2927, "step": 12750 }, { "epoch": 5.246710526315789, "grad_norm": 0.231965571641922, "learning_rate": 4.756990131578948e-06, "loss": 0.3086, "step": 12760 }, { "epoch": 5.250822368421053, "grad_norm": 0.4021833539009094, "learning_rate": 4.752878289473685e-06, "loss": 0.1136, "step": 12770 }, { "epoch": 5.254934210526316, "grad_norm": 79.92125701904297, "learning_rate": 4.748766447368421e-06, "loss": 0.08, "step": 12780 }, { "epoch": 5.259046052631579, "grad_norm": 0.0020701682660728693, "learning_rate": 4.744654605263158e-06, "loss": 0.0191, "step": 12790 }, { "epoch": 5.2631578947368425, "grad_norm": 0.015794718638062477, "learning_rate": 4.7405427631578945e-06, "loss": 0.0095, "step": 12800 }, { "epoch": 5.267269736842105, "grad_norm": 0.31466972827911377, "learning_rate": 4.736430921052632e-06, "loss": 0.0093, "step": 12810 }, { "epoch": 5.271381578947368, "grad_norm": 0.1875465214252472, "learning_rate": 4.732319078947369e-06, "loss": 0.1396, "step": 12820 }, { "epoch": 5.275493421052632, "grad_norm": 12.725687026977539, "learning_rate": 4.728207236842105e-06, "loss": 0.1678, "step": 12830 }, { "epoch": 5.279605263157895, "grad_norm": 0.07018950581550598, "learning_rate": 4.724095394736843e-06, "loss": 0.083, "step": 12840 }, { "epoch": 5.2837171052631575, "grad_norm": 7.318544864654541, "learning_rate": 4.719983552631579e-06, "loss": 0.0284, "step": 12850 }, { "epoch": 5.287828947368421, "grad_norm": 2.0364904403686523, "learning_rate": 4.715871710526317e-06, "loss": 0.1169, "step": 12860 }, { "epoch": 5.291940789473684, "grad_norm": 0.018038108944892883, "learning_rate": 4.711759868421053e-06, "loss": 0.0011, "step": 12870 }, { "epoch": 5.296052631578947, "grad_norm": 0.15174317359924316, "learning_rate": 4.70764802631579e-06, "loss": 0.0515, "step": 12880 }, { "epoch": 5.300164473684211, "grad_norm": 0.06006835028529167, "learning_rate": 4.7035361842105266e-06, "loss": 0.0028, "step": 12890 }, { "epoch": 5.3042763157894735, "grad_norm": 0.21908903121948242, "learning_rate": 4.699424342105264e-06, "loss": 0.0261, "step": 12900 }, { "epoch": 5.308388157894737, "grad_norm": 3.5568766593933105, "learning_rate": 4.695312500000001e-06, "loss": 0.3875, "step": 12910 }, { "epoch": 5.3125, "grad_norm": 0.43564674258232117, "learning_rate": 4.691200657894737e-06, "loss": 0.0769, "step": 12920 }, { "epoch": 5.316611842105263, "grad_norm": 0.14090363681316376, "learning_rate": 4.687088815789474e-06, "loss": 0.035, "step": 12930 }, { "epoch": 5.3207236842105265, "grad_norm": 0.14797407388687134, "learning_rate": 4.6829769736842105e-06, "loss": 0.015, "step": 12940 }, { "epoch": 5.324835526315789, "grad_norm": 9.737833976745605, "learning_rate": 4.678865131578948e-06, "loss": 0.1553, "step": 12950 }, { "epoch": 5.328947368421053, "grad_norm": 0.6963122487068176, "learning_rate": 4.6747532894736845e-06, "loss": 0.0025, "step": 12960 }, { "epoch": 5.333059210526316, "grad_norm": 7.62711763381958, "learning_rate": 4.670641447368421e-06, "loss": 0.0051, "step": 12970 }, { "epoch": 5.337171052631579, "grad_norm": 0.015172763727605343, "learning_rate": 4.666529605263158e-06, "loss": 0.08, "step": 12980 }, { "epoch": 5.3412828947368425, "grad_norm": 0.47709551453590393, "learning_rate": 4.662417763157895e-06, "loss": 0.0008, "step": 12990 }, { "epoch": 5.345394736842105, "grad_norm": 8.82782660482917e-06, "learning_rate": 4.658305921052632e-06, "loss": 0.0129, "step": 13000 }, { "epoch": 5.349506578947368, "grad_norm": 55.73831558227539, "learning_rate": 4.654194078947369e-06, "loss": 0.1756, "step": 13010 }, { "epoch": 5.353618421052632, "grad_norm": 0.013916180469095707, "learning_rate": 4.650082236842106e-06, "loss": 0.0494, "step": 13020 }, { "epoch": 5.357730263157895, "grad_norm": 0.01592426560819149, "learning_rate": 4.6459703947368425e-06, "loss": 0.0407, "step": 13030 }, { "epoch": 5.3618421052631575, "grad_norm": 65.08041381835938, "learning_rate": 4.64185855263158e-06, "loss": 0.0859, "step": 13040 }, { "epoch": 5.365953947368421, "grad_norm": 0.08960430324077606, "learning_rate": 4.6377467105263165e-06, "loss": 0.0019, "step": 13050 }, { "epoch": 5.370065789473684, "grad_norm": 14.894381523132324, "learning_rate": 4.633634868421053e-06, "loss": 0.0202, "step": 13060 }, { "epoch": 5.374177631578947, "grad_norm": 0.005502131301909685, "learning_rate": 4.62952302631579e-06, "loss": 0.0053, "step": 13070 }, { "epoch": 5.378289473684211, "grad_norm": 21.16437339782715, "learning_rate": 4.625411184210526e-06, "loss": 0.1385, "step": 13080 }, { "epoch": 5.3824013157894735, "grad_norm": 0.05989899858832359, "learning_rate": 4.621299342105264e-06, "loss": 0.0147, "step": 13090 }, { "epoch": 5.386513157894737, "grad_norm": 0.002133729634806514, "learning_rate": 4.6171875e-06, "loss": 0.0007, "step": 13100 }, { "epoch": 5.390625, "grad_norm": 47.23904037475586, "learning_rate": 4.613075657894737e-06, "loss": 0.1596, "step": 13110 }, { "epoch": 5.394736842105263, "grad_norm": 0.6729401350021362, "learning_rate": 4.608963815789474e-06, "loss": 0.0944, "step": 13120 }, { "epoch": 5.3988486842105265, "grad_norm": 0.04317113384604454, "learning_rate": 4.60485197368421e-06, "loss": 0.2098, "step": 13130 }, { "epoch": 5.402960526315789, "grad_norm": 0.1802137941122055, "learning_rate": 4.600740131578948e-06, "loss": 0.2664, "step": 13140 }, { "epoch": 5.407072368421053, "grad_norm": 1.5689358711242676, "learning_rate": 4.596628289473684e-06, "loss": 0.2259, "step": 13150 }, { "epoch": 5.411184210526316, "grad_norm": 0.18609057366847992, "learning_rate": 4.592516447368422e-06, "loss": 0.0157, "step": 13160 }, { "epoch": 5.415296052631579, "grad_norm": 116.86294555664062, "learning_rate": 4.588404605263158e-06, "loss": 0.1214, "step": 13170 }, { "epoch": 5.4194078947368425, "grad_norm": 1.2058242559432983, "learning_rate": 4.584292763157895e-06, "loss": 0.3339, "step": 13180 }, { "epoch": 5.423519736842105, "grad_norm": 2.2884137630462646, "learning_rate": 4.5801809210526324e-06, "loss": 0.0364, "step": 13190 }, { "epoch": 5.427631578947368, "grad_norm": 109.472900390625, "learning_rate": 4.576069078947369e-06, "loss": 0.2061, "step": 13200 }, { "epoch": 5.431743421052632, "grad_norm": 83.02965545654297, "learning_rate": 4.571957236842106e-06, "loss": 0.3225, "step": 13210 }, { "epoch": 5.435855263157895, "grad_norm": 4.199236869812012, "learning_rate": 4.567845394736842e-06, "loss": 0.0084, "step": 13220 }, { "epoch": 5.4399671052631575, "grad_norm": 70.33255004882812, "learning_rate": 4.56373355263158e-06, "loss": 0.0816, "step": 13230 }, { "epoch": 5.444078947368421, "grad_norm": 0.0003307022852823138, "learning_rate": 4.559621710526316e-06, "loss": 0.0943, "step": 13240 }, { "epoch": 5.448190789473684, "grad_norm": 0.011274745687842369, "learning_rate": 4.555509868421053e-06, "loss": 0.0331, "step": 13250 }, { "epoch": 5.452302631578947, "grad_norm": 0.04053517058491707, "learning_rate": 4.5513980263157895e-06, "loss": 0.0392, "step": 13260 }, { "epoch": 5.456414473684211, "grad_norm": 29.87954330444336, "learning_rate": 4.547286184210526e-06, "loss": 0.0388, "step": 13270 }, { "epoch": 5.4605263157894735, "grad_norm": 0.06873561441898346, "learning_rate": 4.543174342105264e-06, "loss": 0.0063, "step": 13280 }, { "epoch": 5.464638157894737, "grad_norm": 71.52073669433594, "learning_rate": 4.5390625e-06, "loss": 0.233, "step": 13290 }, { "epoch": 5.46875, "grad_norm": 1.1586495637893677, "learning_rate": 4.534950657894737e-06, "loss": 0.2069, "step": 13300 }, { "epoch": 5.472861842105263, "grad_norm": 4.539996147155762, "learning_rate": 4.530838815789474e-06, "loss": 0.0605, "step": 13310 }, { "epoch": 5.4769736842105265, "grad_norm": 0.469481498003006, "learning_rate": 4.526726973684211e-06, "loss": 0.0862, "step": 13320 }, { "epoch": 5.481085526315789, "grad_norm": 0.10244907438755035, "learning_rate": 4.522615131578948e-06, "loss": 0.191, "step": 13330 }, { "epoch": 5.485197368421053, "grad_norm": 0.026087487116456032, "learning_rate": 4.518503289473685e-06, "loss": 0.0111, "step": 13340 }, { "epoch": 5.489309210526316, "grad_norm": 0.050827547907829285, "learning_rate": 4.5143914473684216e-06, "loss": 0.0014, "step": 13350 }, { "epoch": 5.493421052631579, "grad_norm": 7.369386730715632e-05, "learning_rate": 4.510279605263158e-06, "loss": 0.1305, "step": 13360 }, { "epoch": 5.4975328947368425, "grad_norm": 0.5626145601272583, "learning_rate": 4.506167763157895e-06, "loss": 0.0828, "step": 13370 }, { "epoch": 5.501644736842105, "grad_norm": 83.1645278930664, "learning_rate": 4.502055921052632e-06, "loss": 0.1359, "step": 13380 }, { "epoch": 5.505756578947368, "grad_norm": 0.1569296270608902, "learning_rate": 4.497944078947369e-06, "loss": 0.0781, "step": 13390 }, { "epoch": 5.509868421052632, "grad_norm": 83.7970962524414, "learning_rate": 4.4938322368421055e-06, "loss": 0.1579, "step": 13400 }, { "epoch": 5.513980263157895, "grad_norm": 41.769187927246094, "learning_rate": 4.489720394736842e-06, "loss": 0.019, "step": 13410 }, { "epoch": 5.5180921052631575, "grad_norm": 0.12914538383483887, "learning_rate": 4.4856085526315795e-06, "loss": 0.0873, "step": 13420 }, { "epoch": 5.522203947368421, "grad_norm": 0.0012703635729849339, "learning_rate": 4.481496710526316e-06, "loss": 0.1208, "step": 13430 }, { "epoch": 5.526315789473684, "grad_norm": 14.571756362915039, "learning_rate": 4.477384868421053e-06, "loss": 0.0554, "step": 13440 }, { "epoch": 5.530427631578947, "grad_norm": 0.10961569845676422, "learning_rate": 4.473273026315789e-06, "loss": 0.2124, "step": 13450 }, { "epoch": 5.534539473684211, "grad_norm": 0.089634008705616, "learning_rate": 4.469161184210527e-06, "loss": 0.0901, "step": 13460 }, { "epoch": 5.5386513157894735, "grad_norm": 0.0024442391004413366, "learning_rate": 4.465049342105263e-06, "loss": 0.158, "step": 13470 }, { "epoch": 5.542763157894737, "grad_norm": 0.005259633529931307, "learning_rate": 4.460937500000001e-06, "loss": 0.1263, "step": 13480 }, { "epoch": 5.546875, "grad_norm": 0.020105471834540367, "learning_rate": 4.4568256578947375e-06, "loss": 0.0782, "step": 13490 }, { "epoch": 5.550986842105263, "grad_norm": 8.300959587097168, "learning_rate": 4.452713815789474e-06, "loss": 0.01, "step": 13500 }, { "epoch": 5.5550986842105265, "grad_norm": 0.009316924028098583, "learning_rate": 4.448601973684211e-06, "loss": 0.0182, "step": 13510 }, { "epoch": 5.559210526315789, "grad_norm": 0.0011465359712019563, "learning_rate": 4.444490131578948e-06, "loss": 0.0434, "step": 13520 }, { "epoch": 5.563322368421053, "grad_norm": 0.6433992981910706, "learning_rate": 4.440378289473685e-06, "loss": 0.0082, "step": 13530 }, { "epoch": 5.567434210526316, "grad_norm": 6.921369552612305, "learning_rate": 4.436266447368421e-06, "loss": 0.2029, "step": 13540 }, { "epoch": 5.571546052631579, "grad_norm": 0.5579603314399719, "learning_rate": 4.432154605263158e-06, "loss": 0.0043, "step": 13550 }, { "epoch": 5.5756578947368425, "grad_norm": 1.0528669357299805, "learning_rate": 4.4280427631578946e-06, "loss": 0.0507, "step": 13560 }, { "epoch": 5.579769736842105, "grad_norm": 0.24581840634346008, "learning_rate": 4.423930921052632e-06, "loss": 0.0037, "step": 13570 }, { "epoch": 5.583881578947368, "grad_norm": 0.12832553684711456, "learning_rate": 4.419819078947369e-06, "loss": 0.0006, "step": 13580 }, { "epoch": 5.587993421052632, "grad_norm": 0.013015109114348888, "learning_rate": 4.415707236842105e-06, "loss": 0.0041, "step": 13590 }, { "epoch": 5.592105263157895, "grad_norm": 0.06528721004724503, "learning_rate": 4.411595394736843e-06, "loss": 0.344, "step": 13600 }, { "epoch": 5.5962171052631575, "grad_norm": 7.076974391937256, "learning_rate": 4.407483552631579e-06, "loss": 0.0548, "step": 13610 }, { "epoch": 5.600328947368421, "grad_norm": 0.002825407776981592, "learning_rate": 4.403371710526316e-06, "loss": 0.2002, "step": 13620 }, { "epoch": 5.604440789473684, "grad_norm": 0.4160158336162567, "learning_rate": 4.399259868421053e-06, "loss": 0.0354, "step": 13630 }, { "epoch": 5.608552631578947, "grad_norm": 0.05299230292439461, "learning_rate": 4.39514802631579e-06, "loss": 0.0077, "step": 13640 }, { "epoch": 5.612664473684211, "grad_norm": 0.13743530213832855, "learning_rate": 4.391036184210527e-06, "loss": 0.0826, "step": 13650 }, { "epoch": 5.6167763157894735, "grad_norm": 2.4112481696647592e-05, "learning_rate": 4.386924342105264e-06, "loss": 0.0542, "step": 13660 }, { "epoch": 5.620888157894737, "grad_norm": 0.04345851391553879, "learning_rate": 4.382812500000001e-06, "loss": 0.0395, "step": 13670 }, { "epoch": 5.625, "grad_norm": 0.7097082138061523, "learning_rate": 4.378700657894737e-06, "loss": 0.0448, "step": 13680 }, { "epoch": 5.629111842105263, "grad_norm": 13.222297668457031, "learning_rate": 4.374588815789474e-06, "loss": 0.1012, "step": 13690 }, { "epoch": 5.6332236842105265, "grad_norm": 0.02765653096139431, "learning_rate": 4.3704769736842105e-06, "loss": 0.0137, "step": 13700 }, { "epoch": 5.637335526315789, "grad_norm": 1.4771395921707153, "learning_rate": 4.366365131578948e-06, "loss": 0.0751, "step": 13710 }, { "epoch": 5.641447368421053, "grad_norm": 0.03940972313284874, "learning_rate": 4.3622532894736845e-06, "loss": 0.0257, "step": 13720 }, { "epoch": 5.645559210526316, "grad_norm": 0.011548848822712898, "learning_rate": 4.358141447368421e-06, "loss": 0.0014, "step": 13730 }, { "epoch": 5.649671052631579, "grad_norm": 24.877702713012695, "learning_rate": 4.354029605263158e-06, "loss": 0.3987, "step": 13740 }, { "epoch": 5.6537828947368425, "grad_norm": 0.5792186856269836, "learning_rate": 4.349917763157895e-06, "loss": 0.002, "step": 13750 }, { "epoch": 5.657894736842105, "grad_norm": 2.246400833129883, "learning_rate": 4.345805921052632e-06, "loss": 0.0552, "step": 13760 }, { "epoch": 5.662006578947368, "grad_norm": 91.90821075439453, "learning_rate": 4.3416940789473684e-06, "loss": 0.1837, "step": 13770 }, { "epoch": 5.666118421052632, "grad_norm": 0.14475248754024506, "learning_rate": 4.337582236842106e-06, "loss": 0.0574, "step": 13780 }, { "epoch": 5.670230263157895, "grad_norm": 1.058624267578125, "learning_rate": 4.3334703947368425e-06, "loss": 0.1577, "step": 13790 }, { "epoch": 5.6743421052631575, "grad_norm": 0.0024513236712664366, "learning_rate": 4.32935855263158e-06, "loss": 0.0034, "step": 13800 }, { "epoch": 5.678453947368421, "grad_norm": 30.24159049987793, "learning_rate": 4.3252467105263166e-06, "loss": 0.0203, "step": 13810 }, { "epoch": 5.682565789473684, "grad_norm": 0.013055126182734966, "learning_rate": 4.321134868421053e-06, "loss": 0.0296, "step": 13820 }, { "epoch": 5.686677631578947, "grad_norm": 0.004864171613007784, "learning_rate": 4.31702302631579e-06, "loss": 0.0186, "step": 13830 }, { "epoch": 5.690789473684211, "grad_norm": 0.012071618810296059, "learning_rate": 4.312911184210526e-06, "loss": 0.0178, "step": 13840 }, { "epoch": 5.6949013157894735, "grad_norm": 0.709204375743866, "learning_rate": 4.308799342105264e-06, "loss": 0.0024, "step": 13850 }, { "epoch": 5.699013157894737, "grad_norm": 0.6648589968681335, "learning_rate": 4.3046875000000004e-06, "loss": 0.0004, "step": 13860 }, { "epoch": 5.703125, "grad_norm": 0.01644590124487877, "learning_rate": 4.300575657894737e-06, "loss": 0.0129, "step": 13870 }, { "epoch": 5.707236842105263, "grad_norm": 72.33856201171875, "learning_rate": 4.296463815789474e-06, "loss": 0.3645, "step": 13880 }, { "epoch": 5.7113486842105265, "grad_norm": 12.403470993041992, "learning_rate": 4.29235197368421e-06, "loss": 0.008, "step": 13890 }, { "epoch": 5.715460526315789, "grad_norm": 0.0032954413909465075, "learning_rate": 4.288240131578948e-06, "loss": 0.0349, "step": 13900 }, { "epoch": 5.719572368421053, "grad_norm": 63.751197814941406, "learning_rate": 4.284128289473684e-06, "loss": 0.0786, "step": 13910 }, { "epoch": 5.723684210526316, "grad_norm": 0.05703187361359596, "learning_rate": 4.280016447368421e-06, "loss": 0.0003, "step": 13920 }, { "epoch": 5.727796052631579, "grad_norm": 2.2651917934417725, "learning_rate": 4.275904605263158e-06, "loss": 0.0292, "step": 13930 }, { "epoch": 5.7319078947368425, "grad_norm": 0.029149610549211502, "learning_rate": 4.271792763157895e-06, "loss": 0.2247, "step": 13940 }, { "epoch": 5.736019736842105, "grad_norm": 0.013435146771371365, "learning_rate": 4.2676809210526325e-06, "loss": 0.1923, "step": 13950 }, { "epoch": 5.740131578947368, "grad_norm": 6.325374852167442e-05, "learning_rate": 4.263569078947369e-06, "loss": 0.0346, "step": 13960 }, { "epoch": 5.744243421052632, "grad_norm": 0.08834278583526611, "learning_rate": 4.259457236842106e-06, "loss": 0.0478, "step": 13970 }, { "epoch": 5.748355263157895, "grad_norm": 0.433755099773407, "learning_rate": 4.255345394736842e-06, "loss": 0.1382, "step": 13980 }, { "epoch": 5.7524671052631575, "grad_norm": 9.808202743530273, "learning_rate": 4.25123355263158e-06, "loss": 0.1213, "step": 13990 }, { "epoch": 5.756578947368421, "grad_norm": 0.3236713707447052, "learning_rate": 4.247121710526316e-06, "loss": 0.0581, "step": 14000 }, { "epoch": 5.760690789473684, "grad_norm": 0.0836513340473175, "learning_rate": 4.243009868421053e-06, "loss": 0.0769, "step": 14010 }, { "epoch": 5.764802631578947, "grad_norm": 0.01956997998058796, "learning_rate": 4.2388980263157896e-06, "loss": 0.0916, "step": 14020 }, { "epoch": 5.768914473684211, "grad_norm": 0.290725439786911, "learning_rate": 4.234786184210526e-06, "loss": 0.0507, "step": 14030 }, { "epoch": 5.7730263157894735, "grad_norm": 123.39529418945312, "learning_rate": 4.230674342105264e-06, "loss": 0.1676, "step": 14040 }, { "epoch": 5.777138157894737, "grad_norm": 1.4887871742248535, "learning_rate": 4.2265625e-06, "loss": 0.0391, "step": 14050 }, { "epoch": 5.78125, "grad_norm": 0.020826542750000954, "learning_rate": 4.222450657894737e-06, "loss": 0.0302, "step": 14060 }, { "epoch": 5.785361842105263, "grad_norm": 0.0018061811570078135, "learning_rate": 4.218338815789474e-06, "loss": 0.0297, "step": 14070 }, { "epoch": 5.7894736842105265, "grad_norm": 83.14344024658203, "learning_rate": 4.214226973684211e-06, "loss": 0.1815, "step": 14080 }, { "epoch": 5.793585526315789, "grad_norm": 0.048979900777339935, "learning_rate": 4.2101151315789475e-06, "loss": 0.015, "step": 14090 }, { "epoch": 5.797697368421053, "grad_norm": 0.09402741491794586, "learning_rate": 4.206003289473685e-06, "loss": 0.1655, "step": 14100 }, { "epoch": 5.801809210526316, "grad_norm": 0.022667359560728073, "learning_rate": 4.201891447368422e-06, "loss": 0.0383, "step": 14110 }, { "epoch": 5.805921052631579, "grad_norm": 0.004963419865816832, "learning_rate": 4.197779605263158e-06, "loss": 0.0758, "step": 14120 }, { "epoch": 5.8100328947368425, "grad_norm": 22.636215209960938, "learning_rate": 4.193667763157895e-06, "loss": 0.1386, "step": 14130 }, { "epoch": 5.814144736842105, "grad_norm": 0.4905577003955841, "learning_rate": 4.189555921052632e-06, "loss": 0.0355, "step": 14140 }, { "epoch": 5.818256578947368, "grad_norm": 0.06438548117876053, "learning_rate": 4.185444078947369e-06, "loss": 0.0085, "step": 14150 }, { "epoch": 5.822368421052632, "grad_norm": 27.258342742919922, "learning_rate": 4.1813322368421055e-06, "loss": 0.0948, "step": 14160 }, { "epoch": 5.826480263157895, "grad_norm": 0.06388669461011887, "learning_rate": 4.177220394736842e-06, "loss": 0.0785, "step": 14170 }, { "epoch": 5.8305921052631575, "grad_norm": 0.03271382674574852, "learning_rate": 4.1731085526315795e-06, "loss": 0.0534, "step": 14180 }, { "epoch": 5.834703947368421, "grad_norm": 0.18359790742397308, "learning_rate": 4.168996710526316e-06, "loss": 0.0073, "step": 14190 }, { "epoch": 5.838815789473684, "grad_norm": 0.005616027396172285, "learning_rate": 4.164884868421053e-06, "loss": 0.2368, "step": 14200 }, { "epoch": 5.842927631578947, "grad_norm": 0.2086600810289383, "learning_rate": 4.160773026315789e-06, "loss": 0.0106, "step": 14210 }, { "epoch": 5.847039473684211, "grad_norm": 18.045181274414062, "learning_rate": 4.156661184210527e-06, "loss": 0.2462, "step": 14220 }, { "epoch": 5.8511513157894735, "grad_norm": 4.4964518547058105, "learning_rate": 4.1525493421052634e-06, "loss": 0.3903, "step": 14230 }, { "epoch": 5.855263157894737, "grad_norm": 0.012703659944236279, "learning_rate": 4.1484375e-06, "loss": 0.0317, "step": 14240 }, { "epoch": 5.859375, "grad_norm": 2.445992946624756, "learning_rate": 4.1443256578947375e-06, "loss": 0.2889, "step": 14250 }, { "epoch": 5.863486842105263, "grad_norm": 1.1592912673950195, "learning_rate": 4.140213815789474e-06, "loss": 0.1458, "step": 14260 }, { "epoch": 5.8675986842105265, "grad_norm": 0.46567994356155396, "learning_rate": 4.136101973684211e-06, "loss": 0.0028, "step": 14270 }, { "epoch": 5.871710526315789, "grad_norm": 96.94190216064453, "learning_rate": 4.131990131578948e-06, "loss": 0.4599, "step": 14280 }, { "epoch": 5.875822368421053, "grad_norm": 0.22972775995731354, "learning_rate": 4.127878289473685e-06, "loss": 0.0446, "step": 14290 }, { "epoch": 5.879934210526316, "grad_norm": 0.0005772469448857009, "learning_rate": 4.123766447368421e-06, "loss": 0.0015, "step": 14300 }, { "epoch": 5.884046052631579, "grad_norm": 0.45011672377586365, "learning_rate": 4.119654605263158e-06, "loss": 0.0278, "step": 14310 }, { "epoch": 5.8881578947368425, "grad_norm": 0.02912212163209915, "learning_rate": 4.115542763157895e-06, "loss": 0.0088, "step": 14320 }, { "epoch": 5.892269736842105, "grad_norm": 1.1713041067123413, "learning_rate": 4.111430921052632e-06, "loss": 0.0243, "step": 14330 }, { "epoch": 5.896381578947368, "grad_norm": 59.63473892211914, "learning_rate": 4.107319078947369e-06, "loss": 0.0342, "step": 14340 }, { "epoch": 5.900493421052632, "grad_norm": 62.537635803222656, "learning_rate": 4.103207236842105e-06, "loss": 0.06, "step": 14350 }, { "epoch": 5.904605263157895, "grad_norm": 6.495685374829918e-05, "learning_rate": 4.099095394736842e-06, "loss": 0.0735, "step": 14360 }, { "epoch": 5.9087171052631575, "grad_norm": 3.6121654510498047, "learning_rate": 4.094983552631579e-06, "loss": 0.0735, "step": 14370 }, { "epoch": 5.912828947368421, "grad_norm": 0.03086622804403305, "learning_rate": 4.090871710526316e-06, "loss": 0.0034, "step": 14380 }, { "epoch": 5.916940789473684, "grad_norm": 0.07952877134084702, "learning_rate": 4.086759868421053e-06, "loss": 0.0433, "step": 14390 }, { "epoch": 5.921052631578947, "grad_norm": 0.419914573431015, "learning_rate": 4.08264802631579e-06, "loss": 0.1109, "step": 14400 }, { "epoch": 5.925164473684211, "grad_norm": 0.00708195473998785, "learning_rate": 4.078536184210527e-06, "loss": 0.0013, "step": 14410 }, { "epoch": 5.9292763157894735, "grad_norm": 0.025563985109329224, "learning_rate": 4.07483552631579e-06, "loss": 0.0536, "step": 14420 }, { "epoch": 5.933388157894737, "grad_norm": 0.0034198074135929346, "learning_rate": 4.070723684210527e-06, "loss": 0.0419, "step": 14430 }, { "epoch": 5.9375, "grad_norm": 0.005447972100228071, "learning_rate": 4.066611842105264e-06, "loss": 0.0912, "step": 14440 }, { "epoch": 5.941611842105263, "grad_norm": 0.10684799402952194, "learning_rate": 4.0625000000000005e-06, "loss": 0.026, "step": 14450 }, { "epoch": 5.9457236842105265, "grad_norm": 0.13118505477905273, "learning_rate": 4.058388157894737e-06, "loss": 0.0021, "step": 14460 }, { "epoch": 5.949835526315789, "grad_norm": 0.44725626707077026, "learning_rate": 4.054276315789474e-06, "loss": 0.0009, "step": 14470 }, { "epoch": 5.953947368421053, "grad_norm": 0.5507727861404419, "learning_rate": 4.050164473684211e-06, "loss": 0.0008, "step": 14480 }, { "epoch": 5.958059210526316, "grad_norm": 0.012128915637731552, "learning_rate": 4.046052631578948e-06, "loss": 0.0067, "step": 14490 }, { "epoch": 5.962171052631579, "grad_norm": 6.004103660583496, "learning_rate": 4.041940789473684e-06, "loss": 0.1394, "step": 14500 }, { "epoch": 5.9662828947368425, "grad_norm": 0.0007618535892106593, "learning_rate": 4.037828947368421e-06, "loss": 0.0646, "step": 14510 }, { "epoch": 5.970394736842105, "grad_norm": 0.003796122968196869, "learning_rate": 4.033717105263158e-06, "loss": 0.1546, "step": 14520 }, { "epoch": 5.974506578947368, "grad_norm": 92.0025863647461, "learning_rate": 4.029605263157895e-06, "loss": 0.2674, "step": 14530 }, { "epoch": 5.978618421052632, "grad_norm": 0.24422402679920197, "learning_rate": 4.025493421052632e-06, "loss": 0.3426, "step": 14540 }, { "epoch": 5.982730263157895, "grad_norm": 100.8232650756836, "learning_rate": 4.021381578947368e-06, "loss": 0.1401, "step": 14550 }, { "epoch": 5.9868421052631575, "grad_norm": 0.010361550375819206, "learning_rate": 4.017269736842106e-06, "loss": 0.1157, "step": 14560 }, { "epoch": 5.990953947368421, "grad_norm": 2.4274368286132812, "learning_rate": 4.013157894736842e-06, "loss": 0.0128, "step": 14570 }, { "epoch": 5.995065789473684, "grad_norm": 33.48968505859375, "learning_rate": 4.00904605263158e-06, "loss": 0.0646, "step": 14580 }, { "epoch": 5.999177631578947, "grad_norm": 0.3265540897846222, "learning_rate": 4.004934210526316e-06, "loss": 0.0041, "step": 14590 }, { "epoch": 6.003289473684211, "grad_norm": 0.12049008160829544, "learning_rate": 4.000822368421053e-06, "loss": 0.2218, "step": 14600 }, { "epoch": 6.0074013157894735, "grad_norm": 0.025286437943577766, "learning_rate": 3.99671052631579e-06, "loss": 0.0038, "step": 14610 }, { "epoch": 6.011513157894737, "grad_norm": 0.015311812050640583, "learning_rate": 3.992598684210527e-06, "loss": 0.002, "step": 14620 }, { "epoch": 6.015625, "grad_norm": 0.2929839491844177, "learning_rate": 3.988486842105264e-06, "loss": 0.0652, "step": 14630 }, { "epoch": 6.019736842105263, "grad_norm": 0.006915655452758074, "learning_rate": 3.984375e-06, "loss": 0.0182, "step": 14640 }, { "epoch": 6.0238486842105265, "grad_norm": 0.03598955273628235, "learning_rate": 3.980263157894737e-06, "loss": 0.0327, "step": 14650 }, { "epoch": 6.027960526315789, "grad_norm": 0.007388933561742306, "learning_rate": 3.9761513157894735e-06, "loss": 0.085, "step": 14660 }, { "epoch": 6.032072368421052, "grad_norm": 0.2169804871082306, "learning_rate": 3.972039473684211e-06, "loss": 0.1496, "step": 14670 }, { "epoch": 6.036184210526316, "grad_norm": 3.28409743309021, "learning_rate": 3.9679276315789476e-06, "loss": 0.0039, "step": 14680 }, { "epoch": 6.040296052631579, "grad_norm": 6.0432586669921875, "learning_rate": 3.963815789473684e-06, "loss": 0.0479, "step": 14690 }, { "epoch": 6.0444078947368425, "grad_norm": 5.428047180175781, "learning_rate": 3.959703947368422e-06, "loss": 0.0167, "step": 14700 }, { "epoch": 6.048519736842105, "grad_norm": 3.8892929553985596, "learning_rate": 3.955592105263158e-06, "loss": 0.0089, "step": 14710 }, { "epoch": 6.052631578947368, "grad_norm": 13.23945426940918, "learning_rate": 3.951480263157895e-06, "loss": 0.0926, "step": 14720 }, { "epoch": 6.056743421052632, "grad_norm": 0.0022840045858174562, "learning_rate": 3.947368421052632e-06, "loss": 0.0017, "step": 14730 }, { "epoch": 6.060855263157895, "grad_norm": 19.486738204956055, "learning_rate": 3.943256578947369e-06, "loss": 0.013, "step": 14740 }, { "epoch": 6.0649671052631575, "grad_norm": 0.0012240969808772206, "learning_rate": 3.9391447368421055e-06, "loss": 0.0591, "step": 14750 }, { "epoch": 6.069078947368421, "grad_norm": 7.582399845123291, "learning_rate": 3.935032894736843e-06, "loss": 0.0043, "step": 14760 }, { "epoch": 6.073190789473684, "grad_norm": 2.2928051948547363, "learning_rate": 3.93092105263158e-06, "loss": 0.0007, "step": 14770 }, { "epoch": 6.077302631578948, "grad_norm": 0.0408819355070591, "learning_rate": 3.926809210526316e-06, "loss": 0.1468, "step": 14780 }, { "epoch": 6.081414473684211, "grad_norm": 0.3422407805919647, "learning_rate": 3.922697368421053e-06, "loss": 0.0169, "step": 14790 }, { "epoch": 6.0855263157894735, "grad_norm": 0.14110057055950165, "learning_rate": 3.918585526315789e-06, "loss": 0.0263, "step": 14800 }, { "epoch": 6.089638157894737, "grad_norm": 0.011704709380865097, "learning_rate": 3.914473684210527e-06, "loss": 0.011, "step": 14810 }, { "epoch": 6.09375, "grad_norm": 0.21069417893886566, "learning_rate": 3.9103618421052635e-06, "loss": 0.0939, "step": 14820 }, { "epoch": 6.097861842105263, "grad_norm": 0.0002515464148018509, "learning_rate": 3.90625e-06, "loss": 0.1361, "step": 14830 }, { "epoch": 6.1019736842105265, "grad_norm": 0.18480344116687775, "learning_rate": 3.902138157894737e-06, "loss": 0.1573, "step": 14840 }, { "epoch": 6.106085526315789, "grad_norm": 0.015395952388644218, "learning_rate": 3.898026315789474e-06, "loss": 0.0741, "step": 14850 }, { "epoch": 6.110197368421052, "grad_norm": 0.2790496349334717, "learning_rate": 3.893914473684211e-06, "loss": 0.0041, "step": 14860 }, { "epoch": 6.114309210526316, "grad_norm": 0.017724014818668365, "learning_rate": 3.889802631578947e-06, "loss": 0.2005, "step": 14870 }, { "epoch": 6.118421052631579, "grad_norm": 0.0019499589689075947, "learning_rate": 3.885690789473685e-06, "loss": 0.0004, "step": 14880 }, { "epoch": 6.1225328947368425, "grad_norm": 0.21595421433448792, "learning_rate": 3.8815789473684214e-06, "loss": 0.0384, "step": 14890 }, { "epoch": 6.126644736842105, "grad_norm": 19.680784225463867, "learning_rate": 3.877467105263158e-06, "loss": 0.0066, "step": 14900 }, { "epoch": 6.130756578947368, "grad_norm": 0.0334845706820488, "learning_rate": 3.8733552631578955e-06, "loss": 0.0008, "step": 14910 }, { "epoch": 6.134868421052632, "grad_norm": 0.09054317325353622, "learning_rate": 3.869243421052632e-06, "loss": 0.0209, "step": 14920 }, { "epoch": 6.138980263157895, "grad_norm": 0.06103743240237236, "learning_rate": 3.865131578947369e-06, "loss": 0.3223, "step": 14930 }, { "epoch": 6.1430921052631575, "grad_norm": 0.028684088960289955, "learning_rate": 3.861019736842105e-06, "loss": 0.0032, "step": 14940 }, { "epoch": 6.147203947368421, "grad_norm": 0.00259952899068594, "learning_rate": 3.856907894736843e-06, "loss": 0.0267, "step": 14950 }, { "epoch": 6.151315789473684, "grad_norm": 0.0002748616971075535, "learning_rate": 3.852796052631579e-06, "loss": 0.094, "step": 14960 }, { "epoch": 6.155427631578948, "grad_norm": 53.90045166015625, "learning_rate": 3.848684210526316e-06, "loss": 0.0521, "step": 14970 }, { "epoch": 6.159539473684211, "grad_norm": 0.017857911065220833, "learning_rate": 3.844572368421053e-06, "loss": 0.0002, "step": 14980 }, { "epoch": 6.1636513157894735, "grad_norm": 4.2133588790893555, "learning_rate": 3.840460526315789e-06, "loss": 0.2049, "step": 14990 }, { "epoch": 6.167763157894737, "grad_norm": 0.0010838140733540058, "learning_rate": 3.836348684210527e-06, "loss": 0.1498, "step": 15000 }, { "epoch": 6.171875, "grad_norm": 0.25620415806770325, "learning_rate": 3.832236842105263e-06, "loss": 0.0029, "step": 15010 }, { "epoch": 6.175986842105263, "grad_norm": 102.45184326171875, "learning_rate": 3.828125000000001e-06, "loss": 0.1873, "step": 15020 }, { "epoch": 6.1800986842105265, "grad_norm": 0.4820612967014313, "learning_rate": 3.824013157894737e-06, "loss": 0.2001, "step": 15030 }, { "epoch": 6.184210526315789, "grad_norm": 64.73114776611328, "learning_rate": 3.819901315789474e-06, "loss": 0.0622, "step": 15040 }, { "epoch": 6.188322368421052, "grad_norm": 0.00912266131490469, "learning_rate": 3.815789473684211e-06, "loss": 0.0231, "step": 15050 }, { "epoch": 6.192434210526316, "grad_norm": 1.8277114629745483, "learning_rate": 3.8116776315789476e-06, "loss": 0.0014, "step": 15060 }, { "epoch": 6.196546052631579, "grad_norm": 0.056886494159698486, "learning_rate": 3.8075657894736846e-06, "loss": 0.0015, "step": 15070 }, { "epoch": 6.2006578947368425, "grad_norm": 0.10024416446685791, "learning_rate": 3.8034539473684212e-06, "loss": 0.0224, "step": 15080 }, { "epoch": 6.204769736842105, "grad_norm": 42.96096420288086, "learning_rate": 3.799342105263158e-06, "loss": 0.0894, "step": 15090 }, { "epoch": 6.208881578947368, "grad_norm": 0.007689566817134619, "learning_rate": 3.7952302631578953e-06, "loss": 0.1738, "step": 15100 }, { "epoch": 6.212993421052632, "grad_norm": 0.0625908225774765, "learning_rate": 3.791118421052632e-06, "loss": 0.1453, "step": 15110 }, { "epoch": 6.217105263157895, "grad_norm": 23.779741287231445, "learning_rate": 3.7870065789473685e-06, "loss": 0.0153, "step": 15120 }, { "epoch": 6.2212171052631575, "grad_norm": 0.0019070090493187308, "learning_rate": 3.7828947368421055e-06, "loss": 0.1492, "step": 15130 }, { "epoch": 6.225328947368421, "grad_norm": 0.03249938786029816, "learning_rate": 3.7787828947368426e-06, "loss": 0.1902, "step": 15140 }, { "epoch": 6.229440789473684, "grad_norm": 0.09956735372543335, "learning_rate": 3.7746710526315796e-06, "loss": 0.0199, "step": 15150 }, { "epoch": 6.233552631578948, "grad_norm": 0.005886862985789776, "learning_rate": 3.770559210526316e-06, "loss": 0.0007, "step": 15160 }, { "epoch": 6.237664473684211, "grad_norm": 3.7049739360809326, "learning_rate": 3.766447368421053e-06, "loss": 0.004, "step": 15170 }, { "epoch": 6.2417763157894735, "grad_norm": 53.633445739746094, "learning_rate": 3.7623355263157894e-06, "loss": 0.5929, "step": 15180 }, { "epoch": 6.245888157894737, "grad_norm": 1.0991520881652832, "learning_rate": 3.758223684210527e-06, "loss": 0.0172, "step": 15190 }, { "epoch": 6.25, "grad_norm": 0.020138980820775032, "learning_rate": 3.7541118421052635e-06, "loss": 0.0978, "step": 15200 }, { "epoch": 6.254111842105263, "grad_norm": 0.02193092741072178, "learning_rate": 3.7500000000000005e-06, "loss": 0.0205, "step": 15210 }, { "epoch": 6.2582236842105265, "grad_norm": 0.04015963524580002, "learning_rate": 3.745888157894737e-06, "loss": 0.0017, "step": 15220 }, { "epoch": 6.262335526315789, "grad_norm": 3.5081162452697754, "learning_rate": 3.7417763157894737e-06, "loss": 0.0017, "step": 15230 }, { "epoch": 6.266447368421053, "grad_norm": 0.0039045389275997877, "learning_rate": 3.737664473684211e-06, "loss": 0.0078, "step": 15240 }, { "epoch": 6.270559210526316, "grad_norm": 0.2885434627532959, "learning_rate": 3.733552631578948e-06, "loss": 0.0171, "step": 15250 }, { "epoch": 6.274671052631579, "grad_norm": 0.028393974527716637, "learning_rate": 3.7294407894736844e-06, "loss": 0.0788, "step": 15260 }, { "epoch": 6.2787828947368425, "grad_norm": 116.33313751220703, "learning_rate": 3.725328947368421e-06, "loss": 0.0904, "step": 15270 }, { "epoch": 6.282894736842105, "grad_norm": 0.009480939246714115, "learning_rate": 3.721217105263158e-06, "loss": 0.0602, "step": 15280 }, { "epoch": 6.287006578947368, "grad_norm": 2.470651865005493, "learning_rate": 3.717105263157895e-06, "loss": 0.0058, "step": 15290 }, { "epoch": 6.291118421052632, "grad_norm": 0.00010311637015547603, "learning_rate": 3.712993421052632e-06, "loss": 0.1697, "step": 15300 }, { "epoch": 6.295230263157895, "grad_norm": 2.6846992113860324e-06, "learning_rate": 3.7088815789473687e-06, "loss": 0.0135, "step": 15310 }, { "epoch": 6.2993421052631575, "grad_norm": 0.005294320173561573, "learning_rate": 3.7047697368421053e-06, "loss": 0.0015, "step": 15320 }, { "epoch": 6.303453947368421, "grad_norm": 0.05376331880688667, "learning_rate": 3.7006578947368428e-06, "loss": 0.2444, "step": 15330 }, { "epoch": 6.307565789473684, "grad_norm": 0.4994964897632599, "learning_rate": 3.6965460526315794e-06, "loss": 0.0348, "step": 15340 }, { "epoch": 6.311677631578947, "grad_norm": 0.04438924789428711, "learning_rate": 3.692434210526316e-06, "loss": 0.1409, "step": 15350 }, { "epoch": 6.315789473684211, "grad_norm": 2.267488718032837, "learning_rate": 3.688322368421053e-06, "loss": 0.01, "step": 15360 }, { "epoch": 6.3199013157894735, "grad_norm": 65.40839385986328, "learning_rate": 3.6842105263157896e-06, "loss": 0.1391, "step": 15370 }, { "epoch": 6.324013157894737, "grad_norm": 1.6076836585998535, "learning_rate": 3.6800986842105267e-06, "loss": 0.0277, "step": 15380 }, { "epoch": 6.328125, "grad_norm": 71.12222290039062, "learning_rate": 3.6759868421052637e-06, "loss": 0.3136, "step": 15390 }, { "epoch": 6.332236842105263, "grad_norm": 0.27795493602752686, "learning_rate": 3.6718750000000003e-06, "loss": 0.0337, "step": 15400 }, { "epoch": 6.3363486842105265, "grad_norm": 0.005055265035480261, "learning_rate": 3.667763157894737e-06, "loss": 0.0913, "step": 15410 }, { "epoch": 6.340460526315789, "grad_norm": 1.20720374584198, "learning_rate": 3.663651315789474e-06, "loss": 0.049, "step": 15420 }, { "epoch": 6.344572368421053, "grad_norm": 0.020946979522705078, "learning_rate": 3.659539473684211e-06, "loss": 0.0702, "step": 15430 }, { "epoch": 6.348684210526316, "grad_norm": 0.006545315962284803, "learning_rate": 3.6554276315789476e-06, "loss": 0.0369, "step": 15440 }, { "epoch": 6.352796052631579, "grad_norm": 0.05248495563864708, "learning_rate": 3.6513157894736846e-06, "loss": 0.0086, "step": 15450 }, { "epoch": 6.3569078947368425, "grad_norm": 19.267728805541992, "learning_rate": 3.6472039473684212e-06, "loss": 0.0154, "step": 15460 }, { "epoch": 6.361019736842105, "grad_norm": 0.8632445931434631, "learning_rate": 3.643092105263158e-06, "loss": 0.0369, "step": 15470 }, { "epoch": 6.365131578947368, "grad_norm": 13.235368728637695, "learning_rate": 3.6389802631578953e-06, "loss": 0.0172, "step": 15480 }, { "epoch": 6.369243421052632, "grad_norm": 0.0018560757162049413, "learning_rate": 3.634868421052632e-06, "loss": 0.0082, "step": 15490 }, { "epoch": 6.373355263157895, "grad_norm": 0.04792603850364685, "learning_rate": 3.6307565789473685e-06, "loss": 0.1249, "step": 15500 }, { "epoch": 6.3774671052631575, "grad_norm": 66.24674224853516, "learning_rate": 3.6266447368421055e-06, "loss": 0.0234, "step": 15510 }, { "epoch": 6.381578947368421, "grad_norm": 1.533945918083191, "learning_rate": 3.6225328947368426e-06, "loss": 0.0262, "step": 15520 }, { "epoch": 6.385690789473684, "grad_norm": 0.4053722620010376, "learning_rate": 3.618421052631579e-06, "loss": 0.0028, "step": 15530 }, { "epoch": 6.389802631578947, "grad_norm": 0.1870436817407608, "learning_rate": 3.6143092105263162e-06, "loss": 0.2569, "step": 15540 }, { "epoch": 6.393914473684211, "grad_norm": 0.31710654497146606, "learning_rate": 3.610197368421053e-06, "loss": 0.0286, "step": 15550 }, { "epoch": 6.3980263157894735, "grad_norm": 0.01970168575644493, "learning_rate": 3.6060855263157894e-06, "loss": 0.0044, "step": 15560 }, { "epoch": 6.402138157894737, "grad_norm": 0.22997547686100006, "learning_rate": 3.601973684210527e-06, "loss": 0.1054, "step": 15570 }, { "epoch": 6.40625, "grad_norm": 0.16296066343784332, "learning_rate": 3.5978618421052635e-06, "loss": 0.1698, "step": 15580 }, { "epoch": 6.410361842105263, "grad_norm": 0.0006991025293245912, "learning_rate": 3.59375e-06, "loss": 0.1825, "step": 15590 }, { "epoch": 6.4144736842105265, "grad_norm": 0.07622674107551575, "learning_rate": 3.589638157894737e-06, "loss": 0.1282, "step": 15600 }, { "epoch": 6.418585526315789, "grad_norm": 0.5237715244293213, "learning_rate": 3.5855263157894737e-06, "loss": 0.0007, "step": 15610 }, { "epoch": 6.422697368421053, "grad_norm": 0.9545769691467285, "learning_rate": 3.581414473684211e-06, "loss": 0.0279, "step": 15620 }, { "epoch": 6.426809210526316, "grad_norm": 0.0007075117900967598, "learning_rate": 3.577302631578948e-06, "loss": 0.2943, "step": 15630 }, { "epoch": 6.430921052631579, "grad_norm": 0.005581384990364313, "learning_rate": 3.5731907894736844e-06, "loss": 0.0005, "step": 15640 }, { "epoch": 6.4350328947368425, "grad_norm": 0.3551817238330841, "learning_rate": 3.569078947368421e-06, "loss": 0.1535, "step": 15650 }, { "epoch": 6.439144736842105, "grad_norm": 0.0016296847024932504, "learning_rate": 3.564967105263158e-06, "loss": 0.0004, "step": 15660 }, { "epoch": 6.443256578947368, "grad_norm": 0.0016560270451009274, "learning_rate": 3.560855263157895e-06, "loss": 0.0101, "step": 15670 }, { "epoch": 6.447368421052632, "grad_norm": 0.1265086531639099, "learning_rate": 3.556743421052632e-06, "loss": 0.0003, "step": 15680 }, { "epoch": 6.451480263157895, "grad_norm": 0.37500542402267456, "learning_rate": 3.5526315789473687e-06, "loss": 0.014, "step": 15690 }, { "epoch": 6.4555921052631575, "grad_norm": 0.041531626135110855, "learning_rate": 3.5485197368421053e-06, "loss": 0.0201, "step": 15700 }, { "epoch": 6.459703947368421, "grad_norm": 60.48843765258789, "learning_rate": 3.544407894736843e-06, "loss": 0.1084, "step": 15710 }, { "epoch": 6.463815789473684, "grad_norm": 0.0003151536511722952, "learning_rate": 3.5402960526315794e-06, "loss": 0.005, "step": 15720 }, { "epoch": 6.467927631578947, "grad_norm": 1.3518043756484985, "learning_rate": 3.536184210526316e-06, "loss": 0.002, "step": 15730 }, { "epoch": 6.472039473684211, "grad_norm": 0.019122233614325523, "learning_rate": 3.5320723684210526e-06, "loss": 0.0592, "step": 15740 }, { "epoch": 6.4761513157894735, "grad_norm": 0.022713489830493927, "learning_rate": 3.5279605263157897e-06, "loss": 0.0002, "step": 15750 }, { "epoch": 6.480263157894737, "grad_norm": 0.03588182106614113, "learning_rate": 3.5238486842105267e-06, "loss": 0.1249, "step": 15760 }, { "epoch": 6.484375, "grad_norm": 38.98967742919922, "learning_rate": 3.5197368421052637e-06, "loss": 0.0222, "step": 15770 }, { "epoch": 6.488486842105263, "grad_norm": 0.08748682588338852, "learning_rate": 3.5156250000000003e-06, "loss": 0.0345, "step": 15780 }, { "epoch": 6.4925986842105265, "grad_norm": 0.46851471066474915, "learning_rate": 3.511513157894737e-06, "loss": 0.0363, "step": 15790 }, { "epoch": 6.496710526315789, "grad_norm": 1.2130106687545776, "learning_rate": 3.5074013157894735e-06, "loss": 0.0027, "step": 15800 }, { "epoch": 6.500822368421053, "grad_norm": 0.11882048100233078, "learning_rate": 3.503289473684211e-06, "loss": 0.0275, "step": 15810 }, { "epoch": 6.504934210526316, "grad_norm": 0.19122253358364105, "learning_rate": 3.4991776315789476e-06, "loss": 0.0953, "step": 15820 }, { "epoch": 6.509046052631579, "grad_norm": 10.092044830322266, "learning_rate": 3.4950657894736846e-06, "loss": 0.1668, "step": 15830 }, { "epoch": 6.5131578947368425, "grad_norm": 0.1250755488872528, "learning_rate": 3.4909539473684212e-06, "loss": 0.0107, "step": 15840 }, { "epoch": 6.517269736842105, "grad_norm": 0.006768290884792805, "learning_rate": 3.486842105263158e-06, "loss": 0.0964, "step": 15850 }, { "epoch": 6.521381578947368, "grad_norm": 0.003787360619753599, "learning_rate": 3.4827302631578953e-06, "loss": 0.0818, "step": 15860 }, { "epoch": 6.525493421052632, "grad_norm": 0.004058108199387789, "learning_rate": 3.478618421052632e-06, "loss": 0.0004, "step": 15870 }, { "epoch": 6.529605263157895, "grad_norm": 1.6648340225219727, "learning_rate": 3.4745065789473685e-06, "loss": 0.0036, "step": 15880 }, { "epoch": 6.5337171052631575, "grad_norm": 0.0452425591647625, "learning_rate": 3.4703947368421056e-06, "loss": 0.0892, "step": 15890 }, { "epoch": 6.537828947368421, "grad_norm": 0.038435883820056915, "learning_rate": 3.4662828947368426e-06, "loss": 0.109, "step": 15900 }, { "epoch": 6.541940789473684, "grad_norm": 0.17975053191184998, "learning_rate": 3.462171052631579e-06, "loss": 0.0345, "step": 15910 }, { "epoch": 6.546052631578947, "grad_norm": 0.018029751256108284, "learning_rate": 3.4580592105263162e-06, "loss": 0.2231, "step": 15920 }, { "epoch": 6.550164473684211, "grad_norm": 100.19669342041016, "learning_rate": 3.453947368421053e-06, "loss": 0.264, "step": 15930 }, { "epoch": 6.5542763157894735, "grad_norm": 0.0011808631243184209, "learning_rate": 3.4498355263157895e-06, "loss": 0.0064, "step": 15940 }, { "epoch": 6.558388157894737, "grad_norm": 0.053727325052022934, "learning_rate": 3.445723684210527e-06, "loss": 0.0457, "step": 15950 }, { "epoch": 6.5625, "grad_norm": 0.07882308214902878, "learning_rate": 3.4416118421052635e-06, "loss": 0.0161, "step": 15960 }, { "epoch": 6.566611842105263, "grad_norm": 0.0010185951832681894, "learning_rate": 3.4375e-06, "loss": 0.3123, "step": 15970 }, { "epoch": 6.5707236842105265, "grad_norm": 0.03667188435792923, "learning_rate": 3.433388157894737e-06, "loss": 0.0293, "step": 15980 }, { "epoch": 6.574835526315789, "grad_norm": 0.4083603322505951, "learning_rate": 3.4292763157894738e-06, "loss": 0.0157, "step": 15990 }, { "epoch": 6.578947368421053, "grad_norm": 0.5004896521568298, "learning_rate": 3.4251644736842112e-06, "loss": 0.0088, "step": 16000 }, { "epoch": 6.583059210526316, "grad_norm": 0.7514389753341675, "learning_rate": 3.421052631578948e-06, "loss": 0.0133, "step": 16010 }, { "epoch": 6.587171052631579, "grad_norm": 7.884642601013184, "learning_rate": 3.4169407894736844e-06, "loss": 0.0301, "step": 16020 }, { "epoch": 6.5912828947368425, "grad_norm": 0.028393534943461418, "learning_rate": 3.412828947368421e-06, "loss": 0.0043, "step": 16030 }, { "epoch": 6.595394736842105, "grad_norm": 6.707031726837158, "learning_rate": 3.408717105263158e-06, "loss": 0.0033, "step": 16040 }, { "epoch": 6.599506578947368, "grad_norm": 0.5291396379470825, "learning_rate": 3.404605263157895e-06, "loss": 0.0036, "step": 16050 }, { "epoch": 6.603618421052632, "grad_norm": 0.9197802543640137, "learning_rate": 3.4004934210526317e-06, "loss": 0.0796, "step": 16060 }, { "epoch": 6.607730263157895, "grad_norm": 0.025339746847748756, "learning_rate": 3.3963815789473687e-06, "loss": 0.012, "step": 16070 }, { "epoch": 6.6118421052631575, "grad_norm": 0.10476145893335342, "learning_rate": 3.3922697368421054e-06, "loss": 0.0095, "step": 16080 }, { "epoch": 6.615953947368421, "grad_norm": 15.861358642578125, "learning_rate": 3.388157894736843e-06, "loss": 0.0038, "step": 16090 }, { "epoch": 6.620065789473684, "grad_norm": 0.002507624914869666, "learning_rate": 3.3840460526315794e-06, "loss": 0.2463, "step": 16100 }, { "epoch": 6.624177631578947, "grad_norm": 46.91939926147461, "learning_rate": 3.379934210526316e-06, "loss": 0.1331, "step": 16110 }, { "epoch": 6.628289473684211, "grad_norm": 77.30012512207031, "learning_rate": 3.3758223684210526e-06, "loss": 0.1453, "step": 16120 }, { "epoch": 6.6324013157894735, "grad_norm": 0.0014009532751515508, "learning_rate": 3.3717105263157897e-06, "loss": 0.0051, "step": 16130 }, { "epoch": 6.636513157894737, "grad_norm": 1.946133017539978, "learning_rate": 3.3675986842105267e-06, "loss": 0.1491, "step": 16140 }, { "epoch": 6.640625, "grad_norm": 44.568275451660156, "learning_rate": 3.3634868421052637e-06, "loss": 0.0212, "step": 16150 }, { "epoch": 6.644736842105263, "grad_norm": 46.899925231933594, "learning_rate": 3.3593750000000003e-06, "loss": 0.0164, "step": 16160 }, { "epoch": 6.6488486842105265, "grad_norm": 55.38142776489258, "learning_rate": 3.355263157894737e-06, "loss": 0.027, "step": 16170 }, { "epoch": 6.652960526315789, "grad_norm": 44.49644088745117, "learning_rate": 3.3511513157894736e-06, "loss": 0.0183, "step": 16180 }, { "epoch": 6.657072368421053, "grad_norm": 9.747554577188566e-05, "learning_rate": 3.347039473684211e-06, "loss": 0.0015, "step": 16190 }, { "epoch": 6.661184210526316, "grad_norm": 7.189190364442766e-05, "learning_rate": 3.3429276315789476e-06, "loss": 0.121, "step": 16200 }, { "epoch": 6.665296052631579, "grad_norm": 0.22454024851322174, "learning_rate": 3.3388157894736847e-06, "loss": 0.0552, "step": 16210 }, { "epoch": 6.6694078947368425, "grad_norm": 0.09667879343032837, "learning_rate": 3.3347039473684213e-06, "loss": 0.1034, "step": 16220 }, { "epoch": 6.673519736842105, "grad_norm": 0.00030903020524419844, "learning_rate": 3.330592105263158e-06, "loss": 0.0216, "step": 16230 }, { "epoch": 6.677631578947368, "grad_norm": 0.0006744477432221174, "learning_rate": 3.3264802631578953e-06, "loss": 0.029, "step": 16240 }, { "epoch": 6.681743421052632, "grad_norm": 0.0019801906310021877, "learning_rate": 3.322368421052632e-06, "loss": 0.0155, "step": 16250 }, { "epoch": 6.685855263157895, "grad_norm": 0.041232798248529434, "learning_rate": 3.3182565789473685e-06, "loss": 0.001, "step": 16260 }, { "epoch": 6.6899671052631575, "grad_norm": 0.022349294275045395, "learning_rate": 3.314144736842105e-06, "loss": 0.0077, "step": 16270 }, { "epoch": 6.694078947368421, "grad_norm": 0.00042771780863404274, "learning_rate": 3.3100328947368426e-06, "loss": 0.0002, "step": 16280 }, { "epoch": 6.698190789473684, "grad_norm": 0.32936057448387146, "learning_rate": 3.3059210526315792e-06, "loss": 0.001, "step": 16290 }, { "epoch": 6.702302631578947, "grad_norm": 1.45549738407135, "learning_rate": 3.3018092105263162e-06, "loss": 0.0842, "step": 16300 }, { "epoch": 6.706414473684211, "grad_norm": 75.73883056640625, "learning_rate": 3.297697368421053e-06, "loss": 0.0506, "step": 16310 }, { "epoch": 6.7105263157894735, "grad_norm": 0.0010807811049744487, "learning_rate": 3.2935855263157895e-06, "loss": 0.0477, "step": 16320 }, { "epoch": 6.714638157894737, "grad_norm": 8.679156303405762, "learning_rate": 3.289473684210527e-06, "loss": 0.0093, "step": 16330 }, { "epoch": 6.71875, "grad_norm": 0.07848314195871353, "learning_rate": 3.2853618421052635e-06, "loss": 0.0009, "step": 16340 }, { "epoch": 6.722861842105263, "grad_norm": 0.0014197869459167123, "learning_rate": 3.28125e-06, "loss": 0.1809, "step": 16350 }, { "epoch": 6.7269736842105265, "grad_norm": 0.10961536318063736, "learning_rate": 3.277138157894737e-06, "loss": 0.0595, "step": 16360 }, { "epoch": 6.731085526315789, "grad_norm": 0.028819315135478973, "learning_rate": 3.2730263157894738e-06, "loss": 0.035, "step": 16370 }, { "epoch": 6.735197368421053, "grad_norm": 0.12262340635061264, "learning_rate": 3.268914473684211e-06, "loss": 0.3158, "step": 16380 }, { "epoch": 6.739309210526316, "grad_norm": 0.001396475243382156, "learning_rate": 3.264802631578948e-06, "loss": 0.0022, "step": 16390 }, { "epoch": 6.743421052631579, "grad_norm": 0.009128180332481861, "learning_rate": 3.2606907894736844e-06, "loss": 0.0893, "step": 16400 }, { "epoch": 6.7475328947368425, "grad_norm": 0.012526153586804867, "learning_rate": 3.256578947368421e-06, "loss": 0.0013, "step": 16410 }, { "epoch": 6.751644736842105, "grad_norm": 0.8247601389884949, "learning_rate": 3.252467105263158e-06, "loss": 0.0028, "step": 16420 }, { "epoch": 6.755756578947368, "grad_norm": 0.003459361381828785, "learning_rate": 3.248355263157895e-06, "loss": 0.0008, "step": 16430 }, { "epoch": 6.759868421052632, "grad_norm": 0.006843214388936758, "learning_rate": 3.2446546052631583e-06, "loss": 0.0352, "step": 16440 }, { "epoch": 6.763980263157895, "grad_norm": 0.20814478397369385, "learning_rate": 3.240542763157895e-06, "loss": 0.0066, "step": 16450 }, { "epoch": 6.7680921052631575, "grad_norm": 0.029610252007842064, "learning_rate": 3.236430921052632e-06, "loss": 0.0001, "step": 16460 }, { "epoch": 6.772203947368421, "grad_norm": 0.811451256275177, "learning_rate": 3.2323190789473686e-06, "loss": 0.0861, "step": 16470 }, { "epoch": 6.776315789473684, "grad_norm": 0.24721278250217438, "learning_rate": 3.2282072368421056e-06, "loss": 0.016, "step": 16480 }, { "epoch": 6.780427631578947, "grad_norm": 85.86888122558594, "learning_rate": 3.2240953947368426e-06, "loss": 0.2855, "step": 16490 }, { "epoch": 6.784539473684211, "grad_norm": 0.6534148454666138, "learning_rate": 3.2199835526315792e-06, "loss": 0.0448, "step": 16500 }, { "epoch": 6.7886513157894735, "grad_norm": 0.04840293154120445, "learning_rate": 3.215871710526316e-06, "loss": 0.0451, "step": 16510 }, { "epoch": 6.792763157894737, "grad_norm": 0.07276813685894012, "learning_rate": 3.211759868421053e-06, "loss": 0.0015, "step": 16520 }, { "epoch": 6.796875, "grad_norm": 0.16435430943965912, "learning_rate": 3.20764802631579e-06, "loss": 0.0069, "step": 16530 }, { "epoch": 6.800986842105263, "grad_norm": 11.423922538757324, "learning_rate": 3.2035361842105265e-06, "loss": 0.0218, "step": 16540 }, { "epoch": 6.8050986842105265, "grad_norm": 10.006841659545898, "learning_rate": 3.1994243421052636e-06, "loss": 0.0037, "step": 16550 }, { "epoch": 6.809210526315789, "grad_norm": 0.2944813072681427, "learning_rate": 3.1953125e-06, "loss": 0.0209, "step": 16560 }, { "epoch": 6.813322368421053, "grad_norm": 0.027711883187294006, "learning_rate": 3.1912006578947368e-06, "loss": 0.346, "step": 16570 }, { "epoch": 6.817434210526316, "grad_norm": 0.01937483809888363, "learning_rate": 3.1870888157894742e-06, "loss": 0.018, "step": 16580 }, { "epoch": 6.821546052631579, "grad_norm": 0.11694393306970596, "learning_rate": 3.182976973684211e-06, "loss": 0.0014, "step": 16590 }, { "epoch": 6.8256578947368425, "grad_norm": 0.011637609452009201, "learning_rate": 3.1788651315789474e-06, "loss": 0.0743, "step": 16600 }, { "epoch": 6.829769736842105, "grad_norm": 0.0008141595753841102, "learning_rate": 3.1747532894736845e-06, "loss": 0.009, "step": 16610 }, { "epoch": 6.833881578947368, "grad_norm": 0.004597888328135014, "learning_rate": 3.170641447368421e-06, "loss": 0.0006, "step": 16620 }, { "epoch": 6.837993421052632, "grad_norm": 2.2619147300720215, "learning_rate": 3.1665296052631585e-06, "loss": 0.0054, "step": 16630 }, { "epoch": 6.842105263157895, "grad_norm": 0.02606726624071598, "learning_rate": 3.162417763157895e-06, "loss": 0.1062, "step": 16640 }, { "epoch": 6.8462171052631575, "grad_norm": 0.62782222032547, "learning_rate": 3.1583059210526318e-06, "loss": 0.1222, "step": 16650 }, { "epoch": 6.850328947368421, "grad_norm": 2.099074125289917, "learning_rate": 3.1541940789473684e-06, "loss": 0.0009, "step": 16660 }, { "epoch": 6.854440789473684, "grad_norm": 0.028837570920586586, "learning_rate": 3.150082236842106e-06, "loss": 0.3018, "step": 16670 }, { "epoch": 6.858552631578947, "grad_norm": 0.07627784460783005, "learning_rate": 3.1459703947368424e-06, "loss": 0.0038, "step": 16680 }, { "epoch": 6.862664473684211, "grad_norm": 0.03027813322842121, "learning_rate": 3.141858552631579e-06, "loss": 0.009, "step": 16690 }, { "epoch": 6.8667763157894735, "grad_norm": 0.024021049961447716, "learning_rate": 3.137746710526316e-06, "loss": 0.0797, "step": 16700 }, { "epoch": 6.870888157894737, "grad_norm": 0.0008662066538818181, "learning_rate": 3.1336348684210527e-06, "loss": 0.0039, "step": 16710 }, { "epoch": 6.875, "grad_norm": 0.003181145526468754, "learning_rate": 3.12952302631579e-06, "loss": 0.1774, "step": 16720 }, { "epoch": 6.879111842105263, "grad_norm": 0.35272935032844543, "learning_rate": 3.1254111842105267e-06, "loss": 0.0915, "step": 16730 }, { "epoch": 6.8832236842105265, "grad_norm": 5.994056701660156, "learning_rate": 3.1212993421052634e-06, "loss": 0.1812, "step": 16740 }, { "epoch": 6.887335526315789, "grad_norm": 0.1723736673593521, "learning_rate": 3.1171875e-06, "loss": 0.0865, "step": 16750 }, { "epoch": 6.891447368421053, "grad_norm": 0.045521121472120285, "learning_rate": 3.113075657894737e-06, "loss": 0.0032, "step": 16760 }, { "epoch": 6.895559210526316, "grad_norm": 25.4680233001709, "learning_rate": 3.108963815789474e-06, "loss": 0.0145, "step": 16770 }, { "epoch": 6.899671052631579, "grad_norm": 42.64589309692383, "learning_rate": 3.104851973684211e-06, "loss": 0.3033, "step": 16780 }, { "epoch": 6.9037828947368425, "grad_norm": 0.0010777267161756754, "learning_rate": 3.1007401315789477e-06, "loss": 0.0036, "step": 16790 }, { "epoch": 6.907894736842105, "grad_norm": 0.05889376997947693, "learning_rate": 3.0966282894736843e-06, "loss": 0.006, "step": 16800 }, { "epoch": 6.912006578947368, "grad_norm": 2.3862788677215576, "learning_rate": 3.092516447368421e-06, "loss": 0.0118, "step": 16810 }, { "epoch": 6.916118421052632, "grad_norm": 0.4769994020462036, "learning_rate": 3.0884046052631583e-06, "loss": 0.1088, "step": 16820 }, { "epoch": 6.920230263157895, "grad_norm": 11.872325897216797, "learning_rate": 3.084292763157895e-06, "loss": 0.0055, "step": 16830 }, { "epoch": 6.9243421052631575, "grad_norm": 1.2848358154296875, "learning_rate": 3.080180921052632e-06, "loss": 0.0026, "step": 16840 }, { "epoch": 6.928453947368421, "grad_norm": 0.01796242967247963, "learning_rate": 3.0760690789473686e-06, "loss": 0.0634, "step": 16850 }, { "epoch": 6.932565789473684, "grad_norm": 1.0784401893615723, "learning_rate": 3.0719572368421056e-06, "loss": 0.0024, "step": 16860 }, { "epoch": 6.936677631578947, "grad_norm": 1.7011362314224243, "learning_rate": 3.0678453947368426e-06, "loss": 0.0013, "step": 16870 }, { "epoch": 6.940789473684211, "grad_norm": 0.2864052653312683, "learning_rate": 3.0637335526315793e-06, "loss": 0.1147, "step": 16880 }, { "epoch": 6.9449013157894735, "grad_norm": 0.4255252480506897, "learning_rate": 3.059621710526316e-06, "loss": 0.0022, "step": 16890 }, { "epoch": 6.949013157894737, "grad_norm": 0.007732719648629427, "learning_rate": 3.0555098684210525e-06, "loss": 0.0004, "step": 16900 }, { "epoch": 6.953125, "grad_norm": 1.8991835117340088, "learning_rate": 3.05139802631579e-06, "loss": 0.0041, "step": 16910 }, { "epoch": 6.957236842105263, "grad_norm": 0.32865986227989197, "learning_rate": 3.0472861842105265e-06, "loss": 0.0413, "step": 16920 }, { "epoch": 6.9613486842105265, "grad_norm": 2.4880166053771973, "learning_rate": 3.0431743421052636e-06, "loss": 0.0635, "step": 16930 }, { "epoch": 6.965460526315789, "grad_norm": 17.090551376342773, "learning_rate": 3.0390625e-06, "loss": 0.2446, "step": 16940 }, { "epoch": 6.969572368421053, "grad_norm": 0.25987255573272705, "learning_rate": 3.0349506578947368e-06, "loss": 0.3464, "step": 16950 }, { "epoch": 6.973684210526316, "grad_norm": 0.030250955373048782, "learning_rate": 3.0308388157894742e-06, "loss": 0.1271, "step": 16960 }, { "epoch": 6.977796052631579, "grad_norm": 1.7025675773620605, "learning_rate": 3.026726973684211e-06, "loss": 0.0192, "step": 16970 }, { "epoch": 6.9819078947368425, "grad_norm": 0.005293203983455896, "learning_rate": 3.0226151315789475e-06, "loss": 0.0584, "step": 16980 }, { "epoch": 6.986019736842105, "grad_norm": 0.010041214525699615, "learning_rate": 3.0185032894736845e-06, "loss": 0.1611, "step": 16990 }, { "epoch": 6.990131578947368, "grad_norm": 0.4960373342037201, "learning_rate": 3.014391447368421e-06, "loss": 0.0782, "step": 17000 }, { "epoch": 6.994243421052632, "grad_norm": 0.0012437421828508377, "learning_rate": 3.010279605263158e-06, "loss": 0.0094, "step": 17010 }, { "epoch": 6.998355263157895, "grad_norm": 0.10035240650177002, "learning_rate": 3.006167763157895e-06, "loss": 0.0813, "step": 17020 }, { "epoch": 7.0024671052631575, "grad_norm": 0.008433901704847813, "learning_rate": 3.0020559210526318e-06, "loss": 0.0069, "step": 17030 }, { "epoch": 7.006578947368421, "grad_norm": 0.015171228908002377, "learning_rate": 2.9979440789473684e-06, "loss": 0.0541, "step": 17040 }, { "epoch": 7.010690789473684, "grad_norm": 0.0029152543283998966, "learning_rate": 2.993832236842106e-06, "loss": 0.0029, "step": 17050 }, { "epoch": 7.014802631578948, "grad_norm": 60.9827880859375, "learning_rate": 2.9897203947368424e-06, "loss": 0.0344, "step": 17060 }, { "epoch": 7.018914473684211, "grad_norm": 0.023271074518561363, "learning_rate": 2.985608552631579e-06, "loss": 0.0913, "step": 17070 }, { "epoch": 7.0230263157894735, "grad_norm": 1.9478614330291748, "learning_rate": 2.981496710526316e-06, "loss": 0.0033, "step": 17080 }, { "epoch": 7.027138157894737, "grad_norm": 0.0011102440766990185, "learning_rate": 2.9773848684210527e-06, "loss": 0.0003, "step": 17090 }, { "epoch": 7.03125, "grad_norm": 0.15013249218463898, "learning_rate": 2.97327302631579e-06, "loss": 0.037, "step": 17100 }, { "epoch": 7.035361842105263, "grad_norm": 27.406702041625977, "learning_rate": 2.9691611842105268e-06, "loss": 0.0286, "step": 17110 }, { "epoch": 7.0394736842105265, "grad_norm": 21.793682098388672, "learning_rate": 2.9650493421052634e-06, "loss": 0.007, "step": 17120 }, { "epoch": 7.043585526315789, "grad_norm": 17.29696273803711, "learning_rate": 2.9609375e-06, "loss": 0.0202, "step": 17130 }, { "epoch": 7.047697368421052, "grad_norm": 0.0060730380937457085, "learning_rate": 2.956825657894737e-06, "loss": 0.1069, "step": 17140 }, { "epoch": 7.051809210526316, "grad_norm": 15.568187713623047, "learning_rate": 2.952713815789474e-06, "loss": 0.007, "step": 17150 }, { "epoch": 7.055921052631579, "grad_norm": 0.008214758709073067, "learning_rate": 2.948601973684211e-06, "loss": 0.0032, "step": 17160 }, { "epoch": 7.0600328947368425, "grad_norm": 0.024761829525232315, "learning_rate": 2.9444901315789477e-06, "loss": 0.0378, "step": 17170 }, { "epoch": 7.064144736842105, "grad_norm": 2.6922731194645166e-05, "learning_rate": 2.9403782894736843e-06, "loss": 0.0021, "step": 17180 }, { "epoch": 7.068256578947368, "grad_norm": 0.38131505250930786, "learning_rate": 2.936266447368421e-06, "loss": 0.0004, "step": 17190 }, { "epoch": 7.072368421052632, "grad_norm": 12.381378173828125, "learning_rate": 2.9321546052631584e-06, "loss": 0.0054, "step": 17200 }, { "epoch": 7.076480263157895, "grad_norm": 0.8414236903190613, "learning_rate": 2.928042763157895e-06, "loss": 0.0006, "step": 17210 }, { "epoch": 7.0805921052631575, "grad_norm": 3.8690497875213623, "learning_rate": 2.9239309210526316e-06, "loss": 0.0018, "step": 17220 }, { "epoch": 7.084703947368421, "grad_norm": 0.09639491885900497, "learning_rate": 2.9198190789473686e-06, "loss": 0.1549, "step": 17230 }, { "epoch": 7.088815789473684, "grad_norm": 1.0091197490692139, "learning_rate": 2.9157072368421056e-06, "loss": 0.1204, "step": 17240 }, { "epoch": 7.092927631578948, "grad_norm": 0.03747154399752617, "learning_rate": 2.9115953947368427e-06, "loss": 0.0481, "step": 17250 }, { "epoch": 7.097039473684211, "grad_norm": 0.06905113905668259, "learning_rate": 2.9074835526315793e-06, "loss": 0.1313, "step": 17260 }, { "epoch": 7.1011513157894735, "grad_norm": 76.20909881591797, "learning_rate": 2.903371710526316e-06, "loss": 0.1095, "step": 17270 }, { "epoch": 7.105263157894737, "grad_norm": 0.022789154201745987, "learning_rate": 2.8992598684210525e-06, "loss": 0.0382, "step": 17280 }, { "epoch": 7.109375, "grad_norm": 0.2253035455942154, "learning_rate": 2.89514802631579e-06, "loss": 0.0017, "step": 17290 }, { "epoch": 7.113486842105263, "grad_norm": 0.5477920174598694, "learning_rate": 2.8910361842105266e-06, "loss": 0.0379, "step": 17300 }, { "epoch": 7.1175986842105265, "grad_norm": 0.000436304573668167, "learning_rate": 2.8869243421052636e-06, "loss": 0.2187, "step": 17310 }, { "epoch": 7.121710526315789, "grad_norm": 0.0398329421877861, "learning_rate": 2.8828125e-06, "loss": 0.0004, "step": 17320 }, { "epoch": 7.125822368421052, "grad_norm": 0.03360723704099655, "learning_rate": 2.878700657894737e-06, "loss": 0.0016, "step": 17330 }, { "epoch": 7.129934210526316, "grad_norm": 0.07153233885765076, "learning_rate": 2.8745888157894743e-06, "loss": 0.0569, "step": 17340 }, { "epoch": 7.134046052631579, "grad_norm": 89.36862182617188, "learning_rate": 2.870476973684211e-06, "loss": 0.0574, "step": 17350 }, { "epoch": 7.1381578947368425, "grad_norm": 8.033440589904785, "learning_rate": 2.8663651315789475e-06, "loss": 0.0031, "step": 17360 }, { "epoch": 7.142269736842105, "grad_norm": 0.07727915793657303, "learning_rate": 2.8622532894736845e-06, "loss": 0.0011, "step": 17370 }, { "epoch": 7.146381578947368, "grad_norm": 0.718957781791687, "learning_rate": 2.858141447368421e-06, "loss": 0.0052, "step": 17380 }, { "epoch": 7.150493421052632, "grad_norm": 0.12261500954627991, "learning_rate": 2.854029605263158e-06, "loss": 0.1109, "step": 17390 }, { "epoch": 7.154605263157895, "grad_norm": 0.0148479538038373, "learning_rate": 2.849917763157895e-06, "loss": 0.0013, "step": 17400 }, { "epoch": 7.1587171052631575, "grad_norm": 0.01821858249604702, "learning_rate": 2.8458059210526318e-06, "loss": 0.0625, "step": 17410 }, { "epoch": 7.162828947368421, "grad_norm": 0.0008887009462341666, "learning_rate": 2.8416940789473684e-06, "loss": 0.022, "step": 17420 }, { "epoch": 7.166940789473684, "grad_norm": 0.004944947548210621, "learning_rate": 2.837582236842106e-06, "loss": 0.0002, "step": 17430 }, { "epoch": 7.171052631578948, "grad_norm": 0.042470525950193405, "learning_rate": 2.8334703947368425e-06, "loss": 0.0414, "step": 17440 }, { "epoch": 7.175164473684211, "grad_norm": 0.09561196714639664, "learning_rate": 2.8297697368421057e-06, "loss": 0.1359, "step": 17450 }, { "epoch": 7.1792763157894735, "grad_norm": 0.0038538984954357147, "learning_rate": 2.8256578947368423e-06, "loss": 0.0158, "step": 17460 }, { "epoch": 7.183388157894737, "grad_norm": 0.00038770309765823185, "learning_rate": 2.8215460526315793e-06, "loss": 0.1407, "step": 17470 }, { "epoch": 7.1875, "grad_norm": 0.02665071003139019, "learning_rate": 2.817434210526316e-06, "loss": 0.2346, "step": 17480 }, { "epoch": 7.191611842105263, "grad_norm": 0.09127389639616013, "learning_rate": 2.813322368421053e-06, "loss": 0.0522, "step": 17490 }, { "epoch": 7.1957236842105265, "grad_norm": 0.006527309771627188, "learning_rate": 2.80921052631579e-06, "loss": 0.0368, "step": 17500 }, { "epoch": 7.199835526315789, "grad_norm": 0.01165294460952282, "learning_rate": 2.8050986842105266e-06, "loss": 0.0099, "step": 17510 }, { "epoch": 7.203947368421052, "grad_norm": 0.053283970803022385, "learning_rate": 2.800986842105263e-06, "loss": 0.0007, "step": 17520 }, { "epoch": 7.208059210526316, "grad_norm": 0.0038871471770107746, "learning_rate": 2.796875e-06, "loss": 0.0005, "step": 17530 }, { "epoch": 7.212171052631579, "grad_norm": 0.072207972407341, "learning_rate": 2.7927631578947373e-06, "loss": 0.1669, "step": 17540 }, { "epoch": 7.2162828947368425, "grad_norm": 21.420900344848633, "learning_rate": 2.788651315789474e-06, "loss": 0.1412, "step": 17550 }, { "epoch": 7.220394736842105, "grad_norm": 0.31269371509552, "learning_rate": 2.784539473684211e-06, "loss": 0.0002, "step": 17560 }, { "epoch": 7.224506578947368, "grad_norm": 0.005971134640276432, "learning_rate": 2.7804276315789475e-06, "loss": 0.0058, "step": 17570 }, { "epoch": 7.228618421052632, "grad_norm": 0.02348402515053749, "learning_rate": 2.776315789473684e-06, "loss": 0.0503, "step": 17580 }, { "epoch": 7.232730263157895, "grad_norm": 0.0007353632245212793, "learning_rate": 2.7722039473684216e-06, "loss": 0.1017, "step": 17590 }, { "epoch": 7.2368421052631575, "grad_norm": 0.0007847705273889005, "learning_rate": 2.768092105263158e-06, "loss": 0.0003, "step": 17600 }, { "epoch": 7.240953947368421, "grad_norm": 0.3893216848373413, "learning_rate": 2.7639802631578948e-06, "loss": 0.056, "step": 17610 }, { "epoch": 7.245065789473684, "grad_norm": 0.0016018354799598455, "learning_rate": 2.759868421052632e-06, "loss": 0.0792, "step": 17620 }, { "epoch": 7.249177631578948, "grad_norm": 0.056312862783670425, "learning_rate": 2.755756578947369e-06, "loss": 0.3022, "step": 17630 }, { "epoch": 7.253289473684211, "grad_norm": 1.7281204462051392, "learning_rate": 2.7516447368421055e-06, "loss": 0.0159, "step": 17640 }, { "epoch": 7.2574013157894735, "grad_norm": 0.3130502700805664, "learning_rate": 2.7475328947368425e-06, "loss": 0.002, "step": 17650 }, { "epoch": 7.261513157894737, "grad_norm": 47.450931549072266, "learning_rate": 2.743421052631579e-06, "loss": 0.0829, "step": 17660 }, { "epoch": 7.265625, "grad_norm": 26.058509826660156, "learning_rate": 2.7393092105263157e-06, "loss": 0.0375, "step": 17670 }, { "epoch": 7.269736842105263, "grad_norm": 0.39331546425819397, "learning_rate": 2.735197368421053e-06, "loss": 0.0011, "step": 17680 }, { "epoch": 7.2738486842105265, "grad_norm": 6.446929910453036e-05, "learning_rate": 2.7310855263157898e-06, "loss": 0.0001, "step": 17690 }, { "epoch": 7.277960526315789, "grad_norm": 0.02226644940674305, "learning_rate": 2.7269736842105264e-06, "loss": 0.032, "step": 17700 }, { "epoch": 7.282072368421053, "grad_norm": 69.86528778076172, "learning_rate": 2.7228618421052634e-06, "loss": 0.0477, "step": 17710 }, { "epoch": 7.286184210526316, "grad_norm": 0.06220513954758644, "learning_rate": 2.71875e-06, "loss": 0.0539, "step": 17720 }, { "epoch": 7.290296052631579, "grad_norm": 66.28457641601562, "learning_rate": 2.7146381578947375e-06, "loss": 0.0337, "step": 17730 }, { "epoch": 7.2944078947368425, "grad_norm": 0.001533765229396522, "learning_rate": 2.710526315789474e-06, "loss": 0.0013, "step": 17740 }, { "epoch": 7.298519736842105, "grad_norm": 0.00953629519790411, "learning_rate": 2.7064144736842107e-06, "loss": 0.0065, "step": 17750 }, { "epoch": 7.302631578947368, "grad_norm": 0.0008454016642645001, "learning_rate": 2.7023026315789473e-06, "loss": 0.0112, "step": 17760 }, { "epoch": 7.306743421052632, "grad_norm": 81.20194244384766, "learning_rate": 2.6981907894736843e-06, "loss": 0.3135, "step": 17770 }, { "epoch": 7.310855263157895, "grad_norm": 0.013238240964710712, "learning_rate": 2.6940789473684214e-06, "loss": 0.0044, "step": 17780 }, { "epoch": 7.3149671052631575, "grad_norm": 0.06490977853536606, "learning_rate": 2.6899671052631584e-06, "loss": 0.0383, "step": 17790 }, { "epoch": 7.319078947368421, "grad_norm": 21.222938537597656, "learning_rate": 2.685855263157895e-06, "loss": 0.0087, "step": 17800 }, { "epoch": 7.323190789473684, "grad_norm": 0.05870138481259346, "learning_rate": 2.6817434210526316e-06, "loss": 0.0385, "step": 17810 }, { "epoch": 7.327302631578947, "grad_norm": 0.10026119649410248, "learning_rate": 2.677631578947369e-06, "loss": 0.009, "step": 17820 }, { "epoch": 7.331414473684211, "grad_norm": 0.018641915172338486, "learning_rate": 2.6735197368421057e-06, "loss": 0.0752, "step": 17830 }, { "epoch": 7.3355263157894735, "grad_norm": 25.40374755859375, "learning_rate": 2.6694078947368423e-06, "loss": 0.0081, "step": 17840 }, { "epoch": 7.339638157894737, "grad_norm": 0.01974809169769287, "learning_rate": 2.665296052631579e-06, "loss": 0.0384, "step": 17850 }, { "epoch": 7.34375, "grad_norm": 0.013562479056417942, "learning_rate": 2.661184210526316e-06, "loss": 0.0005, "step": 17860 }, { "epoch": 7.347861842105263, "grad_norm": 0.0006354886572808027, "learning_rate": 2.657072368421053e-06, "loss": 0.0303, "step": 17870 }, { "epoch": 7.3519736842105265, "grad_norm": 0.045913152396678925, "learning_rate": 2.65296052631579e-06, "loss": 0.0004, "step": 17880 }, { "epoch": 7.356085526315789, "grad_norm": 0.5759618878364563, "learning_rate": 2.6488486842105266e-06, "loss": 0.0016, "step": 17890 }, { "epoch": 7.360197368421053, "grad_norm": 0.031576257199048996, "learning_rate": 2.644736842105263e-06, "loss": 0.0129, "step": 17900 }, { "epoch": 7.364309210526316, "grad_norm": 0.002541609574109316, "learning_rate": 2.640625e-06, "loss": 0.0144, "step": 17910 }, { "epoch": 7.368421052631579, "grad_norm": 11.867419242858887, "learning_rate": 2.6365131578947373e-06, "loss": 0.0057, "step": 17920 }, { "epoch": 7.3725328947368425, "grad_norm": 0.06471031159162521, "learning_rate": 2.632401315789474e-06, "loss": 0.03, "step": 17930 }, { "epoch": 7.376644736842105, "grad_norm": 0.12563598155975342, "learning_rate": 2.628289473684211e-06, "loss": 0.0092, "step": 17940 }, { "epoch": 7.380756578947368, "grad_norm": 0.03712340444326401, "learning_rate": 2.6241776315789475e-06, "loss": 0.0716, "step": 17950 }, { "epoch": 7.384868421052632, "grad_norm": 0.05920384079217911, "learning_rate": 2.620065789473684e-06, "loss": 0.0106, "step": 17960 }, { "epoch": 7.388980263157895, "grad_norm": 20.164339065551758, "learning_rate": 2.6159539473684216e-06, "loss": 0.2053, "step": 17970 }, { "epoch": 7.3930921052631575, "grad_norm": 0.33210375905036926, "learning_rate": 2.611842105263158e-06, "loss": 0.0211, "step": 17980 }, { "epoch": 7.397203947368421, "grad_norm": 0.42094624042510986, "learning_rate": 2.607730263157895e-06, "loss": 0.0053, "step": 17990 }, { "epoch": 7.401315789473684, "grad_norm": 5.063699722290039, "learning_rate": 2.603618421052632e-06, "loss": 0.1868, "step": 18000 }, { "epoch": 7.405427631578947, "grad_norm": 0.022125262767076492, "learning_rate": 2.599506578947369e-06, "loss": 0.1222, "step": 18010 }, { "epoch": 7.409539473684211, "grad_norm": 0.017482653260231018, "learning_rate": 2.5953947368421055e-06, "loss": 0.056, "step": 18020 }, { "epoch": 7.4136513157894735, "grad_norm": 0.00026524471468292177, "learning_rate": 2.5912828947368425e-06, "loss": 0.0495, "step": 18030 }, { "epoch": 7.417763157894737, "grad_norm": 59.36870574951172, "learning_rate": 2.587171052631579e-06, "loss": 0.0286, "step": 18040 }, { "epoch": 7.421875, "grad_norm": 0.0069809104315936565, "learning_rate": 2.5830592105263157e-06, "loss": 0.0115, "step": 18050 }, { "epoch": 7.425986842105263, "grad_norm": 0.05242922902107239, "learning_rate": 2.578947368421053e-06, "loss": 0.0008, "step": 18060 }, { "epoch": 7.4300986842105265, "grad_norm": 0.04204375296831131, "learning_rate": 2.5748355263157898e-06, "loss": 0.0118, "step": 18070 }, { "epoch": 7.434210526315789, "grad_norm": 0.1886630803346634, "learning_rate": 2.5707236842105264e-06, "loss": 0.0003, "step": 18080 }, { "epoch": 7.438322368421053, "grad_norm": 0.8755178451538086, "learning_rate": 2.5666118421052634e-06, "loss": 0.0237, "step": 18090 }, { "epoch": 7.442434210526316, "grad_norm": 8.999763488769531, "learning_rate": 2.5625e-06, "loss": 0.0263, "step": 18100 }, { "epoch": 7.446546052631579, "grad_norm": 0.20764632523059845, "learning_rate": 2.558388157894737e-06, "loss": 0.0012, "step": 18110 }, { "epoch": 7.4506578947368425, "grad_norm": 0.0067911092191934586, "learning_rate": 2.554276315789474e-06, "loss": 0.1123, "step": 18120 }, { "epoch": 7.454769736842105, "grad_norm": 0.9484823346138, "learning_rate": 2.5501644736842107e-06, "loss": 0.034, "step": 18130 }, { "epoch": 7.458881578947368, "grad_norm": 0.0008993869996629655, "learning_rate": 2.5460526315789473e-06, "loss": 0.0037, "step": 18140 }, { "epoch": 7.462993421052632, "grad_norm": 0.0014803741360083222, "learning_rate": 2.5419407894736843e-06, "loss": 0.5715, "step": 18150 }, { "epoch": 7.467105263157895, "grad_norm": 0.00023283193877432495, "learning_rate": 2.5378289473684214e-06, "loss": 0.0531, "step": 18160 }, { "epoch": 7.4712171052631575, "grad_norm": 0.002006068592891097, "learning_rate": 2.533717105263158e-06, "loss": 0.0066, "step": 18170 }, { "epoch": 7.475328947368421, "grad_norm": 0.025551754981279373, "learning_rate": 2.529605263157895e-06, "loss": 0.0429, "step": 18180 }, { "epoch": 7.479440789473684, "grad_norm": 0.07115461677312851, "learning_rate": 2.5254934210526316e-06, "loss": 0.0069, "step": 18190 }, { "epoch": 7.483552631578947, "grad_norm": 0.29560112953186035, "learning_rate": 2.521381578947369e-06, "loss": 0.0207, "step": 18200 }, { "epoch": 7.487664473684211, "grad_norm": 0.2443089634180069, "learning_rate": 2.5172697368421057e-06, "loss": 0.0065, "step": 18210 }, { "epoch": 7.4917763157894735, "grad_norm": 0.015790261328220367, "learning_rate": 2.5131578947368423e-06, "loss": 0.1727, "step": 18220 }, { "epoch": 7.495888157894737, "grad_norm": 0.0037406887859106064, "learning_rate": 2.509046052631579e-06, "loss": 0.0239, "step": 18230 }, { "epoch": 7.5, "grad_norm": 0.0033218811731785536, "learning_rate": 2.504934210526316e-06, "loss": 0.0043, "step": 18240 }, { "epoch": 7.504111842105263, "grad_norm": 4.251826763153076, "learning_rate": 2.500822368421053e-06, "loss": 0.0046, "step": 18250 }, { "epoch": 7.5082236842105265, "grad_norm": 0.12638850510120392, "learning_rate": 2.49671052631579e-06, "loss": 0.001, "step": 18260 }, { "epoch": 7.512335526315789, "grad_norm": 0.43785837292671204, "learning_rate": 2.4925986842105266e-06, "loss": 0.0017, "step": 18270 }, { "epoch": 7.516447368421053, "grad_norm": 95.06336212158203, "learning_rate": 2.4884868421052632e-06, "loss": 0.3436, "step": 18280 }, { "epoch": 7.520559210526316, "grad_norm": 0.06603921949863434, "learning_rate": 2.4843750000000002e-06, "loss": 0.0048, "step": 18290 }, { "epoch": 7.524671052631579, "grad_norm": 29.374561309814453, "learning_rate": 2.480263157894737e-06, "loss": 0.1906, "step": 18300 }, { "epoch": 7.5287828947368425, "grad_norm": 0.49131685495376587, "learning_rate": 2.476151315789474e-06, "loss": 0.0013, "step": 18310 }, { "epoch": 7.532894736842105, "grad_norm": 0.01167258806526661, "learning_rate": 2.472039473684211e-06, "loss": 0.0053, "step": 18320 }, { "epoch": 7.537006578947368, "grad_norm": 0.024763192981481552, "learning_rate": 2.4679276315789475e-06, "loss": 0.0126, "step": 18330 }, { "epoch": 7.541118421052632, "grad_norm": 0.02219541370868683, "learning_rate": 2.4638157894736846e-06, "loss": 0.0045, "step": 18340 }, { "epoch": 7.545230263157895, "grad_norm": 0.0026065174024552107, "learning_rate": 2.459703947368421e-06, "loss": 0.0004, "step": 18350 }, { "epoch": 7.5493421052631575, "grad_norm": 2.9072539806365967, "learning_rate": 2.455592105263158e-06, "loss": 0.1266, "step": 18360 }, { "epoch": 7.553453947368421, "grad_norm": 0.2119685709476471, "learning_rate": 2.451480263157895e-06, "loss": 0.0018, "step": 18370 }, { "epoch": 7.557565789473684, "grad_norm": 0.914875864982605, "learning_rate": 2.447368421052632e-06, "loss": 0.1778, "step": 18380 }, { "epoch": 7.561677631578947, "grad_norm": 2.1064213342469884e-06, "learning_rate": 2.4432565789473684e-06, "loss": 0.0, "step": 18390 }, { "epoch": 7.565789473684211, "grad_norm": 0.06288709491491318, "learning_rate": 2.4391447368421055e-06, "loss": 0.001, "step": 18400 }, { "epoch": 7.5699013157894735, "grad_norm": 0.0016081187641248107, "learning_rate": 2.4350328947368425e-06, "loss": 0.1154, "step": 18410 }, { "epoch": 7.574013157894737, "grad_norm": 0.10475052893161774, "learning_rate": 2.430921052631579e-06, "loss": 0.0003, "step": 18420 }, { "epoch": 7.578125, "grad_norm": 0.010752451606094837, "learning_rate": 2.426809210526316e-06, "loss": 0.0367, "step": 18430 }, { "epoch": 7.582236842105263, "grad_norm": 0.016824735328555107, "learning_rate": 2.4226973684210528e-06, "loss": 0.0011, "step": 18440 }, { "epoch": 7.5863486842105265, "grad_norm": 0.20019060373306274, "learning_rate": 2.41858552631579e-06, "loss": 0.0791, "step": 18450 }, { "epoch": 7.590460526315789, "grad_norm": 0.0019403524929657578, "learning_rate": 2.4144736842105264e-06, "loss": 0.0171, "step": 18460 }, { "epoch": 7.594572368421053, "grad_norm": 0.0023165084421634674, "learning_rate": 2.4103618421052634e-06, "loss": 0.1922, "step": 18470 }, { "epoch": 7.598684210526316, "grad_norm": 40.954689025878906, "learning_rate": 2.40625e-06, "loss": 0.0143, "step": 18480 }, { "epoch": 7.602796052631579, "grad_norm": 0.0013854552526026964, "learning_rate": 2.402138157894737e-06, "loss": 0.0003, "step": 18490 }, { "epoch": 7.6069078947368425, "grad_norm": 0.0016220198012888432, "learning_rate": 2.398026315789474e-06, "loss": 0.4475, "step": 18500 }, { "epoch": 7.611019736842105, "grad_norm": 0.048832572996616364, "learning_rate": 2.3939144736842107e-06, "loss": 0.0003, "step": 18510 }, { "epoch": 7.615131578947368, "grad_norm": 5.190423488616943, "learning_rate": 2.3898026315789473e-06, "loss": 0.0027, "step": 18520 }, { "epoch": 7.619243421052632, "grad_norm": 0.9031723737716675, "learning_rate": 2.3856907894736844e-06, "loss": 0.0028, "step": 18530 }, { "epoch": 7.623355263157895, "grad_norm": 0.02315152995288372, "learning_rate": 2.381578947368421e-06, "loss": 0.1872, "step": 18540 }, { "epoch": 7.6274671052631575, "grad_norm": 0.025038328021764755, "learning_rate": 2.377467105263158e-06, "loss": 0.0054, "step": 18550 }, { "epoch": 7.631578947368421, "grad_norm": 0.27843013405799866, "learning_rate": 2.373355263157895e-06, "loss": 0.0178, "step": 18560 }, { "epoch": 7.635690789473684, "grad_norm": 0.24566157162189484, "learning_rate": 2.369243421052632e-06, "loss": 0.0026, "step": 18570 }, { "epoch": 7.639802631578947, "grad_norm": 0.00402486976236105, "learning_rate": 2.3651315789473687e-06, "loss": 0.0115, "step": 18580 }, { "epoch": 7.643914473684211, "grad_norm": 0.11018090695142746, "learning_rate": 2.3610197368421053e-06, "loss": 0.0342, "step": 18590 }, { "epoch": 7.6480263157894735, "grad_norm": 0.10057216137647629, "learning_rate": 2.3569078947368423e-06, "loss": 0.0003, "step": 18600 }, { "epoch": 7.652138157894737, "grad_norm": 0.00027278883499093354, "learning_rate": 2.352796052631579e-06, "loss": 0.0003, "step": 18610 }, { "epoch": 7.65625, "grad_norm": 56.96162796020508, "learning_rate": 2.348684210526316e-06, "loss": 0.0221, "step": 18620 }, { "epoch": 7.660361842105263, "grad_norm": 0.32547372579574585, "learning_rate": 2.344572368421053e-06, "loss": 0.0014, "step": 18630 }, { "epoch": 7.6644736842105265, "grad_norm": 0.0017805839888751507, "learning_rate": 2.3404605263157896e-06, "loss": 0.001, "step": 18640 }, { "epoch": 7.668585526315789, "grad_norm": 0.003879399737343192, "learning_rate": 2.3363486842105266e-06, "loss": 0.0014, "step": 18650 }, { "epoch": 7.672697368421053, "grad_norm": 0.0014848548453301191, "learning_rate": 2.3322368421052632e-06, "loss": 0.0116, "step": 18660 }, { "epoch": 7.676809210526316, "grad_norm": 12.23731517791748, "learning_rate": 2.3281250000000003e-06, "loss": 0.0269, "step": 18670 }, { "epoch": 7.680921052631579, "grad_norm": 0.0011852079769596457, "learning_rate": 2.324013157894737e-06, "loss": 0.0164, "step": 18680 }, { "epoch": 7.6850328947368425, "grad_norm": 0.19763517379760742, "learning_rate": 2.319901315789474e-06, "loss": 0.0009, "step": 18690 }, { "epoch": 7.689144736842105, "grad_norm": 0.48111191391944885, "learning_rate": 2.3157894736842105e-06, "loss": 0.0044, "step": 18700 }, { "epoch": 7.693256578947368, "grad_norm": 35.88668441772461, "learning_rate": 2.3116776315789475e-06, "loss": 0.018, "step": 18710 }, { "epoch": 7.697368421052632, "grad_norm": 0.4510961174964905, "learning_rate": 2.3075657894736846e-06, "loss": 0.0201, "step": 18720 }, { "epoch": 7.701480263157895, "grad_norm": 0.5701677799224854, "learning_rate": 2.303453947368421e-06, "loss": 0.0481, "step": 18730 }, { "epoch": 7.7055921052631575, "grad_norm": 0.023179659619927406, "learning_rate": 2.2993421052631582e-06, "loss": 0.0016, "step": 18740 }, { "epoch": 7.709703947368421, "grad_norm": 22.315807342529297, "learning_rate": 2.295230263157895e-06, "loss": 0.0161, "step": 18750 }, { "epoch": 7.713815789473684, "grad_norm": 0.0018141778418794274, "learning_rate": 2.291118421052632e-06, "loss": 0.054, "step": 18760 }, { "epoch": 7.717927631578947, "grad_norm": 0.030514366924762726, "learning_rate": 2.2870065789473685e-06, "loss": 0.0029, "step": 18770 }, { "epoch": 7.722039473684211, "grad_norm": 55.70794677734375, "learning_rate": 2.2828947368421055e-06, "loss": 0.0208, "step": 18780 }, { "epoch": 7.7261513157894735, "grad_norm": 2.907081127166748, "learning_rate": 2.2787828947368425e-06, "loss": 0.1681, "step": 18790 }, { "epoch": 7.730263157894737, "grad_norm": 6.280221462249756, "learning_rate": 2.274671052631579e-06, "loss": 0.0125, "step": 18800 }, { "epoch": 7.734375, "grad_norm": 0.02076088637113571, "learning_rate": 2.270559210526316e-06, "loss": 0.0321, "step": 18810 }, { "epoch": 7.738486842105263, "grad_norm": 119.19054412841797, "learning_rate": 2.2664473684210528e-06, "loss": 0.2288, "step": 18820 }, { "epoch": 7.7425986842105265, "grad_norm": 0.0002872719196602702, "learning_rate": 2.26233552631579e-06, "loss": 0.0028, "step": 18830 }, { "epoch": 7.746710526315789, "grad_norm": 0.004000790882855654, "learning_rate": 2.2582236842105264e-06, "loss": 0.0001, "step": 18840 }, { "epoch": 7.750822368421053, "grad_norm": 8.02085018157959, "learning_rate": 2.2541118421052634e-06, "loss": 0.1302, "step": 18850 }, { "epoch": 7.754934210526316, "grad_norm": 0.011402469128370285, "learning_rate": 2.25e-06, "loss": 0.1687, "step": 18860 }, { "epoch": 7.759046052631579, "grad_norm": 0.03788752481341362, "learning_rate": 2.245888157894737e-06, "loss": 0.0934, "step": 18870 }, { "epoch": 7.7631578947368425, "grad_norm": 0.01945221796631813, "learning_rate": 2.241776315789474e-06, "loss": 0.0411, "step": 18880 }, { "epoch": 7.767269736842105, "grad_norm": 0.009651098400354385, "learning_rate": 2.2376644736842107e-06, "loss": 0.0012, "step": 18890 }, { "epoch": 7.771381578947368, "grad_norm": 0.5178136825561523, "learning_rate": 2.2335526315789473e-06, "loss": 0.0013, "step": 18900 }, { "epoch": 7.775493421052632, "grad_norm": 0.00043934083078056574, "learning_rate": 2.2294407894736844e-06, "loss": 0.0033, "step": 18910 }, { "epoch": 7.779605263157895, "grad_norm": 0.4319761097431183, "learning_rate": 2.225328947368421e-06, "loss": 0.0091, "step": 18920 }, { "epoch": 7.7837171052631575, "grad_norm": 0.0020860438235104084, "learning_rate": 2.221217105263158e-06, "loss": 0.0012, "step": 18930 }, { "epoch": 7.787828947368421, "grad_norm": 0.00016367706120945513, "learning_rate": 2.217105263157895e-06, "loss": 0.0004, "step": 18940 }, { "epoch": 7.791940789473684, "grad_norm": 0.043262697756290436, "learning_rate": 2.212993421052632e-06, "loss": 0.0086, "step": 18950 }, { "epoch": 7.796052631578947, "grad_norm": 0.04485470429062843, "learning_rate": 2.2088815789473687e-06, "loss": 0.0615, "step": 18960 }, { "epoch": 7.800164473684211, "grad_norm": 0.005395176820456982, "learning_rate": 2.2047697368421053e-06, "loss": 0.0035, "step": 18970 }, { "epoch": 7.8042763157894735, "grad_norm": 0.009486041963100433, "learning_rate": 2.2006578947368423e-06, "loss": 0.0013, "step": 18980 }, { "epoch": 7.808388157894737, "grad_norm": 0.0067543457262218, "learning_rate": 2.196546052631579e-06, "loss": 0.0006, "step": 18990 }, { "epoch": 7.8125, "grad_norm": 4.130396366119385, "learning_rate": 2.192434210526316e-06, "loss": 0.1369, "step": 19000 }, { "epoch": 7.816611842105263, "grad_norm": 0.011135872453451157, "learning_rate": 2.1883223684210526e-06, "loss": 0.0014, "step": 19010 }, { "epoch": 7.8207236842105265, "grad_norm": 11.25094223022461, "learning_rate": 2.1842105263157896e-06, "loss": 0.0195, "step": 19020 }, { "epoch": 7.824835526315789, "grad_norm": 0.0633421316742897, "learning_rate": 2.1800986842105266e-06, "loss": 0.0052, "step": 19030 }, { "epoch": 7.828947368421053, "grad_norm": 1.6162339448928833, "learning_rate": 2.1759868421052632e-06, "loss": 0.0023, "step": 19040 }, { "epoch": 7.833059210526316, "grad_norm": 0.2471773475408554, "learning_rate": 2.1718750000000003e-06, "loss": 0.0224, "step": 19050 }, { "epoch": 7.837171052631579, "grad_norm": 0.005709551740437746, "learning_rate": 2.167763157894737e-06, "loss": 0.051, "step": 19060 }, { "epoch": 7.8412828947368425, "grad_norm": 0.0015900834696367383, "learning_rate": 2.163651315789474e-06, "loss": 0.0017, "step": 19070 }, { "epoch": 7.845394736842105, "grad_norm": 0.2822861969470978, "learning_rate": 2.1595394736842105e-06, "loss": 0.0009, "step": 19080 }, { "epoch": 7.849506578947368, "grad_norm": 0.12618885934352875, "learning_rate": 2.1554276315789476e-06, "loss": 0.0029, "step": 19090 }, { "epoch": 7.853618421052632, "grad_norm": 0.14742045104503632, "learning_rate": 2.1513157894736846e-06, "loss": 0.0061, "step": 19100 }, { "epoch": 7.857730263157895, "grad_norm": 0.021756095811724663, "learning_rate": 2.147203947368421e-06, "loss": 0.0007, "step": 19110 }, { "epoch": 7.8618421052631575, "grad_norm": 0.06931746006011963, "learning_rate": 2.1430921052631582e-06, "loss": 0.0055, "step": 19120 }, { "epoch": 7.865953947368421, "grad_norm": 0.003343462711200118, "learning_rate": 2.138980263157895e-06, "loss": 0.0014, "step": 19130 }, { "epoch": 7.870065789473684, "grad_norm": 0.07024644315242767, "learning_rate": 2.134868421052632e-06, "loss": 0.0004, "step": 19140 }, { "epoch": 7.874177631578947, "grad_norm": 0.19473795592784882, "learning_rate": 2.1307565789473685e-06, "loss": 0.0041, "step": 19150 }, { "epoch": 7.878289473684211, "grad_norm": 0.02276478335261345, "learning_rate": 2.1266447368421055e-06, "loss": 0.0011, "step": 19160 }, { "epoch": 7.8824013157894735, "grad_norm": 0.04341767355799675, "learning_rate": 2.122532894736842e-06, "loss": 0.0002, "step": 19170 }, { "epoch": 7.886513157894737, "grad_norm": 0.0044844504445791245, "learning_rate": 2.118421052631579e-06, "loss": 0.0388, "step": 19180 }, { "epoch": 7.890625, "grad_norm": 2.5301058292388916, "learning_rate": 2.114309210526316e-06, "loss": 0.0032, "step": 19190 }, { "epoch": 7.894736842105263, "grad_norm": 4.735191345214844, "learning_rate": 2.110197368421053e-06, "loss": 0.2667, "step": 19200 }, { "epoch": 7.8988486842105265, "grad_norm": 0.2628767788410187, "learning_rate": 2.10608552631579e-06, "loss": 0.0011, "step": 19210 }, { "epoch": 7.902960526315789, "grad_norm": 0.0044731805101037025, "learning_rate": 2.1019736842105264e-06, "loss": 0.0383, "step": 19220 }, { "epoch": 7.907072368421053, "grad_norm": 0.020919961854815483, "learning_rate": 2.097861842105263e-06, "loss": 0.0099, "step": 19230 }, { "epoch": 7.911184210526316, "grad_norm": 2.660637255758047e-05, "learning_rate": 2.09375e-06, "loss": 0.0002, "step": 19240 }, { "epoch": 7.915296052631579, "grad_norm": 0.4476158320903778, "learning_rate": 2.089638157894737e-06, "loss": 0.0502, "step": 19250 }, { "epoch": 7.9194078947368425, "grad_norm": 0.040128808468580246, "learning_rate": 2.085526315789474e-06, "loss": 0.001, "step": 19260 }, { "epoch": 7.923519736842105, "grad_norm": 0.007745147682726383, "learning_rate": 2.0814144736842107e-06, "loss": 0.0959, "step": 19270 }, { "epoch": 7.927631578947368, "grad_norm": 0.05349845439195633, "learning_rate": 2.0773026315789474e-06, "loss": 0.0123, "step": 19280 }, { "epoch": 7.931743421052632, "grad_norm": 3.0623538494110107, "learning_rate": 2.0731907894736844e-06, "loss": 0.0075, "step": 19290 }, { "epoch": 7.935855263157895, "grad_norm": 0.04638973996043205, "learning_rate": 2.069078947368421e-06, "loss": 0.0008, "step": 19300 }, { "epoch": 7.9399671052631575, "grad_norm": 0.6016738414764404, "learning_rate": 2.064967105263158e-06, "loss": 0.0009, "step": 19310 }, { "epoch": 7.944078947368421, "grad_norm": 7.008052349090576, "learning_rate": 2.060855263157895e-06, "loss": 0.0126, "step": 19320 }, { "epoch": 7.948190789473684, "grad_norm": 1.100488305091858, "learning_rate": 2.0567434210526317e-06, "loss": 0.0007, "step": 19330 }, { "epoch": 7.952302631578947, "grad_norm": 0.02694018743932247, "learning_rate": 2.0526315789473687e-06, "loss": 0.001, "step": 19340 }, { "epoch": 7.956414473684211, "grad_norm": 0.015289338305592537, "learning_rate": 2.0485197368421053e-06, "loss": 0.0756, "step": 19350 }, { "epoch": 7.9605263157894735, "grad_norm": 0.0007966504781506956, "learning_rate": 2.0444078947368423e-06, "loss": 0.1056, "step": 19360 }, { "epoch": 7.964638157894737, "grad_norm": 0.19730961322784424, "learning_rate": 2.040296052631579e-06, "loss": 0.0108, "step": 19370 }, { "epoch": 7.96875, "grad_norm": 98.96208953857422, "learning_rate": 2.036184210526316e-06, "loss": 0.1418, "step": 19380 }, { "epoch": 7.972861842105263, "grad_norm": 0.01025812141597271, "learning_rate": 2.0320723684210526e-06, "loss": 0.0054, "step": 19390 }, { "epoch": 7.9769736842105265, "grad_norm": 0.009373322129249573, "learning_rate": 2.0279605263157896e-06, "loss": 0.0506, "step": 19400 }, { "epoch": 7.981085526315789, "grad_norm": 1.8693686723709106, "learning_rate": 2.0238486842105266e-06, "loss": 0.0203, "step": 19410 }, { "epoch": 7.985197368421053, "grad_norm": 0.08877656608819962, "learning_rate": 2.0197368421052633e-06, "loss": 0.0003, "step": 19420 }, { "epoch": 7.989309210526316, "grad_norm": 0.06994573026895523, "learning_rate": 2.0156250000000003e-06, "loss": 0.0022, "step": 19430 }, { "epoch": 7.993421052631579, "grad_norm": 0.05470941215753555, "learning_rate": 2.011513157894737e-06, "loss": 0.0115, "step": 19440 }, { "epoch": 7.9975328947368425, "grad_norm": 0.00017169667989946902, "learning_rate": 2.007401315789474e-06, "loss": 0.004, "step": 19450 }, { "epoch": 8.001644736842104, "grad_norm": 0.0003484611224848777, "learning_rate": 2.0032894736842105e-06, "loss": 0.1084, "step": 19460 }, { "epoch": 8.005756578947368, "grad_norm": 0.6738888621330261, "learning_rate": 1.9991776315789476e-06, "loss": 0.0307, "step": 19470 }, { "epoch": 8.009868421052632, "grad_norm": 8.662660598754883, "learning_rate": 1.9950657894736846e-06, "loss": 0.0083, "step": 19480 }, { "epoch": 8.013980263157896, "grad_norm": 0.003018350340425968, "learning_rate": 1.990953947368421e-06, "loss": 0.0003, "step": 19490 }, { "epoch": 8.018092105263158, "grad_norm": 0.002044851426035166, "learning_rate": 1.9868421052631582e-06, "loss": 0.0002, "step": 19500 }, { "epoch": 8.022203947368421, "grad_norm": 0.031014088541269302, "learning_rate": 1.982730263157895e-06, "loss": 0.0001, "step": 19510 }, { "epoch": 8.026315789473685, "grad_norm": 0.923111081123352, "learning_rate": 1.978618421052632e-06, "loss": 0.038, "step": 19520 }, { "epoch": 8.030427631578947, "grad_norm": 0.00033148875809274614, "learning_rate": 1.9745065789473685e-06, "loss": 0.0548, "step": 19530 }, { "epoch": 8.03453947368421, "grad_norm": 0.002517040353268385, "learning_rate": 1.970394736842105e-06, "loss": 0.0792, "step": 19540 }, { "epoch": 8.038651315789474, "grad_norm": 0.9935305714607239, "learning_rate": 1.966282894736842e-06, "loss": 0.0134, "step": 19550 }, { "epoch": 8.042763157894736, "grad_norm": 0.0008426306885667145, "learning_rate": 1.962171052631579e-06, "loss": 0.0069, "step": 19560 }, { "epoch": 8.046875, "grad_norm": 0.13046151399612427, "learning_rate": 1.958059210526316e-06, "loss": 0.0019, "step": 19570 }, { "epoch": 8.050986842105264, "grad_norm": 0.00037748689646832645, "learning_rate": 1.953947368421053e-06, "loss": 0.074, "step": 19580 }, { "epoch": 8.055098684210526, "grad_norm": 0.11450521647930145, "learning_rate": 1.94983552631579e-06, "loss": 0.0003, "step": 19590 }, { "epoch": 8.05921052631579, "grad_norm": 0.0015347315929830074, "learning_rate": 1.9457236842105264e-06, "loss": 0.0054, "step": 19600 }, { "epoch": 8.063322368421053, "grad_norm": 0.004102109465748072, "learning_rate": 1.941611842105263e-06, "loss": 0.001, "step": 19610 }, { "epoch": 8.067434210526315, "grad_norm": 0.041049521416425705, "learning_rate": 1.9375e-06, "loss": 0.0076, "step": 19620 }, { "epoch": 8.071546052631579, "grad_norm": 0.012618621811270714, "learning_rate": 1.933388157894737e-06, "loss": 0.0229, "step": 19630 }, { "epoch": 8.075657894736842, "grad_norm": 0.0814553052186966, "learning_rate": 1.929276315789474e-06, "loss": 0.0004, "step": 19640 }, { "epoch": 8.079769736842104, "grad_norm": 0.4970387816429138, "learning_rate": 1.9251644736842108e-06, "loss": 0.0231, "step": 19650 }, { "epoch": 8.083881578947368, "grad_norm": 0.002543308772146702, "learning_rate": 1.9210526315789474e-06, "loss": 0.0002, "step": 19660 }, { "epoch": 8.087993421052632, "grad_norm": 0.0017268903320655227, "learning_rate": 1.9169407894736844e-06, "loss": 0.0163, "step": 19670 }, { "epoch": 8.092105263157896, "grad_norm": 0.0731857493519783, "learning_rate": 1.912828947368421e-06, "loss": 0.0018, "step": 19680 }, { "epoch": 8.096217105263158, "grad_norm": 0.10746968537569046, "learning_rate": 1.908717105263158e-06, "loss": 0.1044, "step": 19690 }, { "epoch": 8.100328947368421, "grad_norm": 0.45797082781791687, "learning_rate": 1.9046052631578949e-06, "loss": 0.0738, "step": 19700 }, { "epoch": 8.104440789473685, "grad_norm": 0.005883471108973026, "learning_rate": 1.9004934210526319e-06, "loss": 0.0015, "step": 19710 }, { "epoch": 8.108552631578947, "grad_norm": 0.41784408688545227, "learning_rate": 1.8963815789473685e-06, "loss": 0.002, "step": 19720 }, { "epoch": 8.11266447368421, "grad_norm": 1.8776366710662842, "learning_rate": 1.8922697368421053e-06, "loss": 0.0038, "step": 19730 }, { "epoch": 8.116776315789474, "grad_norm": 1.0636307001113892, "learning_rate": 1.8881578947368423e-06, "loss": 0.0006, "step": 19740 }, { "epoch": 8.120888157894736, "grad_norm": 7.0736284255981445, "learning_rate": 1.884046052631579e-06, "loss": 0.0024, "step": 19750 }, { "epoch": 8.125, "grad_norm": 0.3112788498401642, "learning_rate": 1.879934210526316e-06, "loss": 0.0278, "step": 19760 }, { "epoch": 8.129111842105264, "grad_norm": 0.15028250217437744, "learning_rate": 1.8758223684210528e-06, "loss": 0.0057, "step": 19770 }, { "epoch": 8.133223684210526, "grad_norm": 0.0035646697506308556, "learning_rate": 1.8717105263157898e-06, "loss": 0.0009, "step": 19780 }, { "epoch": 8.13733552631579, "grad_norm": 27.537662506103516, "learning_rate": 1.8675986842105265e-06, "loss": 0.0144, "step": 19790 }, { "epoch": 8.141447368421053, "grad_norm": 0.49430692195892334, "learning_rate": 1.8634868421052633e-06, "loss": 0.0003, "step": 19800 }, { "epoch": 8.145559210526315, "grad_norm": 0.18590670824050903, "learning_rate": 1.8593750000000003e-06, "loss": 0.0008, "step": 19810 }, { "epoch": 8.149671052631579, "grad_norm": 0.012526645325124264, "learning_rate": 1.855263157894737e-06, "loss": 0.0032, "step": 19820 }, { "epoch": 8.153782894736842, "grad_norm": 0.0013973083114251494, "learning_rate": 1.851151315789474e-06, "loss": 0.0001, "step": 19830 }, { "epoch": 8.157894736842104, "grad_norm": 0.0001499800564488396, "learning_rate": 1.8470394736842108e-06, "loss": 0.0018, "step": 19840 }, { "epoch": 8.162006578947368, "grad_norm": 0.009103098884224892, "learning_rate": 1.8429276315789474e-06, "loss": 0.0002, "step": 19850 }, { "epoch": 8.166118421052632, "grad_norm": 0.07066304981708527, "learning_rate": 1.8388157894736844e-06, "loss": 0.0012, "step": 19860 }, { "epoch": 8.170230263157896, "grad_norm": 0.0303801316767931, "learning_rate": 1.834703947368421e-06, "loss": 0.0002, "step": 19870 }, { "epoch": 8.174342105263158, "grad_norm": 3.0752358436584473, "learning_rate": 1.830592105263158e-06, "loss": 0.5168, "step": 19880 }, { "epoch": 8.178453947368421, "grad_norm": 0.01663016527891159, "learning_rate": 1.8264802631578949e-06, "loss": 0.0171, "step": 19890 }, { "epoch": 8.182565789473685, "grad_norm": 0.00019705640443135053, "learning_rate": 1.822368421052632e-06, "loss": 0.1079, "step": 19900 }, { "epoch": 8.186677631578947, "grad_norm": 107.32159423828125, "learning_rate": 1.8182565789473685e-06, "loss": 0.1808, "step": 19910 }, { "epoch": 8.19078947368421, "grad_norm": 0.07139640301465988, "learning_rate": 1.8141447368421053e-06, "loss": 0.0005, "step": 19920 }, { "epoch": 8.194901315789474, "grad_norm": 0.007935700938105583, "learning_rate": 1.8100328947368424e-06, "loss": 0.0021, "step": 19930 }, { "epoch": 8.199013157894736, "grad_norm": 0.6481422781944275, "learning_rate": 1.805921052631579e-06, "loss": 0.0008, "step": 19940 }, { "epoch": 8.203125, "grad_norm": 24.086261749267578, "learning_rate": 1.801809210526316e-06, "loss": 0.0123, "step": 19950 }, { "epoch": 8.207236842105264, "grad_norm": 0.04818257689476013, "learning_rate": 1.7976973684210528e-06, "loss": 0.0459, "step": 19960 }, { "epoch": 8.211348684210526, "grad_norm": 0.0009188010590150952, "learning_rate": 1.7935855263157898e-06, "loss": 0.0034, "step": 19970 }, { "epoch": 8.21546052631579, "grad_norm": 0.01342172920703888, "learning_rate": 1.7894736842105265e-06, "loss": 0.0011, "step": 19980 }, { "epoch": 8.219572368421053, "grad_norm": 0.01993461884558201, "learning_rate": 1.7853618421052633e-06, "loss": 0.0011, "step": 19990 }, { "epoch": 8.223684210526315, "grad_norm": 0.0007997810025699437, "learning_rate": 1.78125e-06, "loss": 0.1823, "step": 20000 }, { "epoch": 8.227796052631579, "grad_norm": 0.00039553025271743536, "learning_rate": 1.777138157894737e-06, "loss": 0.0475, "step": 20010 }, { "epoch": 8.231907894736842, "grad_norm": 0.029472004622220993, "learning_rate": 1.773026315789474e-06, "loss": 0.001, "step": 20020 }, { "epoch": 8.236019736842104, "grad_norm": 0.13089366257190704, "learning_rate": 1.7689144736842106e-06, "loss": 0.0053, "step": 20030 }, { "epoch": 8.240131578947368, "grad_norm": 0.07195531576871872, "learning_rate": 1.7648026315789474e-06, "loss": 0.0747, "step": 20040 }, { "epoch": 8.244243421052632, "grad_norm": 0.00015038340643513948, "learning_rate": 1.7606907894736844e-06, "loss": 0.0335, "step": 20050 }, { "epoch": 8.248355263157896, "grad_norm": 0.014339683577418327, "learning_rate": 1.756578947368421e-06, "loss": 0.0147, "step": 20060 }, { "epoch": 8.252467105263158, "grad_norm": 0.04160633683204651, "learning_rate": 1.752467105263158e-06, "loss": 0.0161, "step": 20070 }, { "epoch": 8.256578947368421, "grad_norm": 0.09322536736726761, "learning_rate": 1.7483552631578949e-06, "loss": 0.0025, "step": 20080 }, { "epoch": 8.260690789473685, "grad_norm": 0.003090464510023594, "learning_rate": 1.744243421052632e-06, "loss": 0.0136, "step": 20090 }, { "epoch": 8.264802631578947, "grad_norm": 0.0023041388485580683, "learning_rate": 1.7401315789473685e-06, "loss": 0.0058, "step": 20100 }, { "epoch": 8.26891447368421, "grad_norm": 0.02266848273575306, "learning_rate": 1.7360197368421053e-06, "loss": 0.0002, "step": 20110 }, { "epoch": 8.273026315789474, "grad_norm": 0.042311668395996094, "learning_rate": 1.7319078947368424e-06, "loss": 0.0184, "step": 20120 }, { "epoch": 8.277138157894736, "grad_norm": 0.032821740955114365, "learning_rate": 1.727796052631579e-06, "loss": 0.0104, "step": 20130 }, { "epoch": 8.28125, "grad_norm": 4.365161418914795, "learning_rate": 1.723684210526316e-06, "loss": 0.0067, "step": 20140 }, { "epoch": 8.285361842105264, "grad_norm": 0.2567031979560852, "learning_rate": 1.7195723684210528e-06, "loss": 0.0941, "step": 20150 }, { "epoch": 8.289473684210526, "grad_norm": 0.00967695377767086, "learning_rate": 1.7154605263157896e-06, "loss": 0.001, "step": 20160 }, { "epoch": 8.29358552631579, "grad_norm": 0.005732494406402111, "learning_rate": 1.7113486842105265e-06, "loss": 0.1364, "step": 20170 }, { "epoch": 8.297697368421053, "grad_norm": 1.0150928497314453, "learning_rate": 1.7072368421052633e-06, "loss": 0.1152, "step": 20180 }, { "epoch": 8.301809210526315, "grad_norm": 141.66786193847656, "learning_rate": 1.703125e-06, "loss": 0.12, "step": 20190 }, { "epoch": 8.305921052631579, "grad_norm": 7.221472263336182, "learning_rate": 1.699013157894737e-06, "loss": 0.0027, "step": 20200 }, { "epoch": 8.310032894736842, "grad_norm": 22.781036376953125, "learning_rate": 1.694901315789474e-06, "loss": 0.0124, "step": 20210 }, { "epoch": 8.314144736842104, "grad_norm": 0.007851810194551945, "learning_rate": 1.6907894736842106e-06, "loss": 0.0007, "step": 20220 }, { "epoch": 8.318256578947368, "grad_norm": 0.5739917755126953, "learning_rate": 1.6866776315789474e-06, "loss": 0.0041, "step": 20230 }, { "epoch": 8.322368421052632, "grad_norm": 1.0938937664031982, "learning_rate": 1.6825657894736844e-06, "loss": 0.156, "step": 20240 }, { "epoch": 8.326480263157896, "grad_norm": 0.008032879792153835, "learning_rate": 1.678453947368421e-06, "loss": 0.0599, "step": 20250 }, { "epoch": 8.330592105263158, "grad_norm": 0.003187471767887473, "learning_rate": 1.674342105263158e-06, "loss": 0.0016, "step": 20260 }, { "epoch": 8.334703947368421, "grad_norm": 1.5649436712265015, "learning_rate": 1.6702302631578949e-06, "loss": 0.0017, "step": 20270 }, { "epoch": 8.338815789473685, "grad_norm": 3.473937511444092, "learning_rate": 1.666118421052632e-06, "loss": 0.0016, "step": 20280 }, { "epoch": 8.342927631578947, "grad_norm": 0.0011312034912407398, "learning_rate": 1.6620065789473685e-06, "loss": 0.1365, "step": 20290 }, { "epoch": 8.34703947368421, "grad_norm": 0.022491563111543655, "learning_rate": 1.6578947368421053e-06, "loss": 0.0005, "step": 20300 }, { "epoch": 8.351151315789474, "grad_norm": 0.01682455465197563, "learning_rate": 1.6537828947368424e-06, "loss": 0.1908, "step": 20310 }, { "epoch": 8.355263157894736, "grad_norm": 0.15820525586605072, "learning_rate": 1.649671052631579e-06, "loss": 0.0302, "step": 20320 }, { "epoch": 8.359375, "grad_norm": 0.035599756985902786, "learning_rate": 1.645559210526316e-06, "loss": 0.0491, "step": 20330 }, { "epoch": 8.363486842105264, "grad_norm": 0.0006870443467050791, "learning_rate": 1.6414473684210528e-06, "loss": 0.0453, "step": 20340 }, { "epoch": 8.367598684210526, "grad_norm": 0.017046842724084854, "learning_rate": 1.6373355263157897e-06, "loss": 0.001, "step": 20350 }, { "epoch": 8.37171052631579, "grad_norm": 0.02666572667658329, "learning_rate": 1.6332236842105265e-06, "loss": 0.2276, "step": 20360 }, { "epoch": 8.375822368421053, "grad_norm": 2.208775520324707, "learning_rate": 1.6291118421052633e-06, "loss": 0.0019, "step": 20370 }, { "epoch": 8.379934210526315, "grad_norm": 0.1765185445547104, "learning_rate": 1.6250000000000001e-06, "loss": 0.0006, "step": 20380 }, { "epoch": 8.384046052631579, "grad_norm": 0.0014591231010854244, "learning_rate": 1.620888157894737e-06, "loss": 0.3287, "step": 20390 }, { "epoch": 8.388157894736842, "grad_norm": 0.01978524774312973, "learning_rate": 1.616776315789474e-06, "loss": 0.0006, "step": 20400 }, { "epoch": 8.392269736842104, "grad_norm": 0.1453421711921692, "learning_rate": 1.6126644736842106e-06, "loss": 0.0253, "step": 20410 }, { "epoch": 8.396381578947368, "grad_norm": 0.0024396302178502083, "learning_rate": 1.6085526315789474e-06, "loss": 0.0255, "step": 20420 }, { "epoch": 8.400493421052632, "grad_norm": 0.03476029261946678, "learning_rate": 1.6044407894736844e-06, "loss": 0.1261, "step": 20430 }, { "epoch": 8.404605263157896, "grad_norm": 0.0002798949717544019, "learning_rate": 1.600328947368421e-06, "loss": 0.0139, "step": 20440 }, { "epoch": 8.408717105263158, "grad_norm": 0.0015076054260134697, "learning_rate": 1.596217105263158e-06, "loss": 0.0027, "step": 20450 }, { "epoch": 8.412828947368421, "grad_norm": 6.669851779937744, "learning_rate": 1.5921052631578949e-06, "loss": 0.0024, "step": 20460 }, { "epoch": 8.416940789473685, "grad_norm": 0.0023337227758020163, "learning_rate": 1.587993421052632e-06, "loss": 0.0006, "step": 20470 }, { "epoch": 8.421052631578947, "grad_norm": 0.1198020726442337, "learning_rate": 1.5838815789473685e-06, "loss": 0.0007, "step": 20480 }, { "epoch": 8.42516447368421, "grad_norm": 0.021439597010612488, "learning_rate": 1.5797697368421053e-06, "loss": 0.0002, "step": 20490 }, { "epoch": 8.429276315789474, "grad_norm": 0.09605780243873596, "learning_rate": 1.5756578947368424e-06, "loss": 0.0013, "step": 20500 }, { "epoch": 8.433388157894736, "grad_norm": 0.0040932162664830685, "learning_rate": 1.571546052631579e-06, "loss": 0.0022, "step": 20510 }, { "epoch": 8.4375, "grad_norm": 0.00021242848015390337, "learning_rate": 1.567434210526316e-06, "loss": 0.0001, "step": 20520 }, { "epoch": 8.441611842105264, "grad_norm": 0.016136594116687775, "learning_rate": 1.5633223684210526e-06, "loss": 0.0033, "step": 20530 }, { "epoch": 8.445723684210526, "grad_norm": 1.1913484334945679, "learning_rate": 1.5592105263157897e-06, "loss": 0.0005, "step": 20540 }, { "epoch": 8.44983552631579, "grad_norm": 0.0010520926443859935, "learning_rate": 1.5550986842105265e-06, "loss": 0.0196, "step": 20550 }, { "epoch": 8.453947368421053, "grad_norm": 0.05620012804865837, "learning_rate": 1.550986842105263e-06, "loss": 0.222, "step": 20560 }, { "epoch": 8.458059210526315, "grad_norm": 0.35795560479164124, "learning_rate": 1.5468750000000001e-06, "loss": 0.0986, "step": 20570 }, { "epoch": 8.462171052631579, "grad_norm": 146.11349487304688, "learning_rate": 1.542763157894737e-06, "loss": 0.1901, "step": 20580 }, { "epoch": 8.466282894736842, "grad_norm": 0.03306944668292999, "learning_rate": 1.538651315789474e-06, "loss": 0.0003, "step": 20590 }, { "epoch": 8.470394736842104, "grad_norm": 0.0032949261367321014, "learning_rate": 1.5345394736842106e-06, "loss": 0.0021, "step": 20600 }, { "epoch": 8.474506578947368, "grad_norm": 0.044259537011384964, "learning_rate": 1.5304276315789474e-06, "loss": 0.0033, "step": 20610 }, { "epoch": 8.478618421052632, "grad_norm": 0.09277831763029099, "learning_rate": 1.5263157894736844e-06, "loss": 0.055, "step": 20620 }, { "epoch": 8.482730263157896, "grad_norm": 0.0008916630758903921, "learning_rate": 1.522203947368421e-06, "loss": 0.1617, "step": 20630 }, { "epoch": 8.486842105263158, "grad_norm": 1.467398762702942, "learning_rate": 1.518092105263158e-06, "loss": 0.001, "step": 20640 }, { "epoch": 8.490953947368421, "grad_norm": 0.1042017787694931, "learning_rate": 1.5139802631578949e-06, "loss": 0.0006, "step": 20650 }, { "epoch": 8.495065789473685, "grad_norm": 0.010952174663543701, "learning_rate": 1.5098684210526317e-06, "loss": 0.0084, "step": 20660 }, { "epoch": 8.499177631578947, "grad_norm": 0.05859874561429024, "learning_rate": 1.5057565789473685e-06, "loss": 0.0698, "step": 20670 }, { "epoch": 8.50328947368421, "grad_norm": 0.03631851449608803, "learning_rate": 1.5016447368421054e-06, "loss": 0.0006, "step": 20680 }, { "epoch": 8.507401315789474, "grad_norm": 0.0005391056765802205, "learning_rate": 1.4975328947368422e-06, "loss": 0.0158, "step": 20690 }, { "epoch": 8.511513157894736, "grad_norm": 0.050333429127931595, "learning_rate": 1.493421052631579e-06, "loss": 0.1081, "step": 20700 }, { "epoch": 8.515625, "grad_norm": 0.0007912613800726831, "learning_rate": 1.489309210526316e-06, "loss": 0.1647, "step": 20710 }, { "epoch": 8.519736842105264, "grad_norm": 0.006539842579513788, "learning_rate": 1.4851973684210526e-06, "loss": 0.0026, "step": 20720 }, { "epoch": 8.523848684210526, "grad_norm": 0.04584597051143646, "learning_rate": 1.4810855263157897e-06, "loss": 0.0003, "step": 20730 }, { "epoch": 8.52796052631579, "grad_norm": 0.15439699590206146, "learning_rate": 1.4769736842105265e-06, "loss": 0.0212, "step": 20740 }, { "epoch": 8.532072368421053, "grad_norm": 0.0626300796866417, "learning_rate": 1.472861842105263e-06, "loss": 0.003, "step": 20750 }, { "epoch": 8.536184210526315, "grad_norm": 33.395931243896484, "learning_rate": 1.4687500000000001e-06, "loss": 0.0136, "step": 20760 }, { "epoch": 8.540296052631579, "grad_norm": 0.22012284398078918, "learning_rate": 1.464638157894737e-06, "loss": 0.014, "step": 20770 }, { "epoch": 8.544407894736842, "grad_norm": 6.601381755899638e-05, "learning_rate": 1.460526315789474e-06, "loss": 0.0032, "step": 20780 }, { "epoch": 8.548519736842106, "grad_norm": 0.009945002384483814, "learning_rate": 1.4564144736842106e-06, "loss": 0.0742, "step": 20790 }, { "epoch": 8.552631578947368, "grad_norm": 0.006764125544577837, "learning_rate": 1.4523026315789474e-06, "loss": 0.0733, "step": 20800 }, { "epoch": 8.556743421052632, "grad_norm": 0.08154771476984024, "learning_rate": 1.4481907894736844e-06, "loss": 0.004, "step": 20810 }, { "epoch": 8.560855263157894, "grad_norm": 0.04067627713084221, "learning_rate": 1.444078947368421e-06, "loss": 0.2454, "step": 20820 }, { "epoch": 8.564967105263158, "grad_norm": 0.002863495144993067, "learning_rate": 1.439967105263158e-06, "loss": 0.0003, "step": 20830 }, { "epoch": 8.569078947368421, "grad_norm": 0.7650733590126038, "learning_rate": 1.435855263157895e-06, "loss": 0.0005, "step": 20840 }, { "epoch": 8.573190789473685, "grad_norm": 0.06450064480304718, "learning_rate": 1.4317434210526317e-06, "loss": 0.002, "step": 20850 }, { "epoch": 8.577302631578947, "grad_norm": 0.0002420953824184835, "learning_rate": 1.4276315789473685e-06, "loss": 0.0409, "step": 20860 }, { "epoch": 8.58141447368421, "grad_norm": 9.50124958762899e-05, "learning_rate": 1.4235197368421054e-06, "loss": 0.0002, "step": 20870 }, { "epoch": 8.585526315789474, "grad_norm": 0.24137666821479797, "learning_rate": 1.4194078947368422e-06, "loss": 0.0215, "step": 20880 }, { "epoch": 8.589638157894736, "grad_norm": 0.05245298892259598, "learning_rate": 1.415296052631579e-06, "loss": 0.0008, "step": 20890 }, { "epoch": 8.59375, "grad_norm": 0.0008668911759741604, "learning_rate": 1.411184210526316e-06, "loss": 0.0003, "step": 20900 }, { "epoch": 8.597861842105264, "grad_norm": 0.22282694280147552, "learning_rate": 1.4070723684210526e-06, "loss": 0.1511, "step": 20910 }, { "epoch": 8.601973684210526, "grad_norm": 25.438827514648438, "learning_rate": 1.4029605263157897e-06, "loss": 0.009, "step": 20920 }, { "epoch": 8.60608552631579, "grad_norm": 0.2636086642742157, "learning_rate": 1.3988486842105265e-06, "loss": 0.0316, "step": 20930 }, { "epoch": 8.610197368421053, "grad_norm": 2.275233507156372, "learning_rate": 1.394736842105263e-06, "loss": 0.0008, "step": 20940 }, { "epoch": 8.614309210526315, "grad_norm": 43.522274017333984, "learning_rate": 1.3906250000000001e-06, "loss": 0.0207, "step": 20950 }, { "epoch": 8.618421052631579, "grad_norm": 0.004544149152934551, "learning_rate": 1.386513157894737e-06, "loss": 0.0002, "step": 20960 }, { "epoch": 8.622532894736842, "grad_norm": 2.2549304962158203, "learning_rate": 1.382401315789474e-06, "loss": 0.1462, "step": 20970 }, { "epoch": 8.626644736842106, "grad_norm": 0.8890215754508972, "learning_rate": 1.3782894736842106e-06, "loss": 0.1217, "step": 20980 }, { "epoch": 8.630756578947368, "grad_norm": 0.0007637037779204547, "learning_rate": 1.3741776315789474e-06, "loss": 0.0372, "step": 20990 }, { "epoch": 8.634868421052632, "grad_norm": 0.01935545913875103, "learning_rate": 1.3700657894736844e-06, "loss": 0.0096, "step": 21000 }, { "epoch": 8.638980263157894, "grad_norm": 0.0277896448969841, "learning_rate": 1.365953947368421e-06, "loss": 0.0056, "step": 21010 }, { "epoch": 8.643092105263158, "grad_norm": 0.15072134137153625, "learning_rate": 1.361842105263158e-06, "loss": 0.0026, "step": 21020 }, { "epoch": 8.647203947368421, "grad_norm": 0.7178854942321777, "learning_rate": 1.357730263157895e-06, "loss": 0.1058, "step": 21030 }, { "epoch": 8.651315789473685, "grad_norm": 0.42835232615470886, "learning_rate": 1.3536184210526317e-06, "loss": 0.0004, "step": 21040 }, { "epoch": 8.655427631578947, "grad_norm": 11.795758247375488, "learning_rate": 1.3495065789473685e-06, "loss": 0.2595, "step": 21050 }, { "epoch": 8.65953947368421, "grad_norm": 0.3830679953098297, "learning_rate": 1.3453947368421052e-06, "loss": 0.0013, "step": 21060 }, { "epoch": 8.663651315789474, "grad_norm": 0.09519729763269424, "learning_rate": 1.3412828947368422e-06, "loss": 0.0049, "step": 21070 }, { "epoch": 8.667763157894736, "grad_norm": 0.0007457378087565303, "learning_rate": 1.337171052631579e-06, "loss": 0.0002, "step": 21080 }, { "epoch": 8.671875, "grad_norm": 0.0007768406067043543, "learning_rate": 1.333059210526316e-06, "loss": 0.0149, "step": 21090 }, { "epoch": 8.675986842105264, "grad_norm": 0.034300968050956726, "learning_rate": 1.3289473684210526e-06, "loss": 0.0046, "step": 21100 }, { "epoch": 8.680098684210526, "grad_norm": 0.009960854426026344, "learning_rate": 1.3248355263157897e-06, "loss": 0.0029, "step": 21110 }, { "epoch": 8.68421052631579, "grad_norm": 0.00154775264672935, "learning_rate": 1.3207236842105265e-06, "loss": 0.0327, "step": 21120 }, { "epoch": 8.688322368421053, "grad_norm": 0.113314189016819, "learning_rate": 1.3166118421052631e-06, "loss": 0.0044, "step": 21130 }, { "epoch": 8.692434210526315, "grad_norm": 0.014615275897085667, "learning_rate": 1.3125000000000001e-06, "loss": 0.0044, "step": 21140 }, { "epoch": 8.696546052631579, "grad_norm": 1.0225980281829834, "learning_rate": 1.308388157894737e-06, "loss": 0.0018, "step": 21150 }, { "epoch": 8.700657894736842, "grad_norm": 0.0005192348035052419, "learning_rate": 1.304276315789474e-06, "loss": 0.0028, "step": 21160 }, { "epoch": 8.704769736842106, "grad_norm": 21.393903732299805, "learning_rate": 1.3001644736842106e-06, "loss": 0.0061, "step": 21170 }, { "epoch": 8.708881578947368, "grad_norm": 0.007193740922957659, "learning_rate": 1.2960526315789474e-06, "loss": 0.0025, "step": 21180 }, { "epoch": 8.712993421052632, "grad_norm": 0.0013441045302897692, "learning_rate": 1.2919407894736842e-06, "loss": 0.0651, "step": 21190 }, { "epoch": 8.717105263157894, "grad_norm": 0.05820111557841301, "learning_rate": 1.287828947368421e-06, "loss": 0.0483, "step": 21200 }, { "epoch": 8.721217105263158, "grad_norm": 0.032725583761930466, "learning_rate": 1.283717105263158e-06, "loss": 0.0002, "step": 21210 }, { "epoch": 8.725328947368421, "grad_norm": 0.18405325710773468, "learning_rate": 1.2796052631578947e-06, "loss": 0.1587, "step": 21220 }, { "epoch": 8.729440789473685, "grad_norm": 0.3414626717567444, "learning_rate": 1.2754934210526317e-06, "loss": 0.0078, "step": 21230 }, { "epoch": 8.733552631578947, "grad_norm": 0.01579272747039795, "learning_rate": 1.2713815789473686e-06, "loss": 0.0097, "step": 21240 }, { "epoch": 8.73766447368421, "grad_norm": 0.008699091151356697, "learning_rate": 1.2672697368421052e-06, "loss": 0.0042, "step": 21250 }, { "epoch": 8.741776315789474, "grad_norm": 0.8667317032814026, "learning_rate": 1.2631578947368422e-06, "loss": 0.0009, "step": 21260 }, { "epoch": 8.745888157894736, "grad_norm": 1.3391830921173096, "learning_rate": 1.259046052631579e-06, "loss": 0.0064, "step": 21270 }, { "epoch": 8.75, "grad_norm": 0.04508015140891075, "learning_rate": 1.254934210526316e-06, "loss": 0.1151, "step": 21280 }, { "epoch": 8.754111842105264, "grad_norm": 48.28853988647461, "learning_rate": 1.2508223684210527e-06, "loss": 0.0162, "step": 21290 }, { "epoch": 8.758223684210526, "grad_norm": 0.0023160665296018124, "learning_rate": 1.2467105263157895e-06, "loss": 0.0003, "step": 21300 }, { "epoch": 8.76233552631579, "grad_norm": 2.6674623489379883, "learning_rate": 1.2425986842105265e-06, "loss": 0.0155, "step": 21310 }, { "epoch": 8.766447368421053, "grad_norm": 0.00609904620796442, "learning_rate": 1.2384868421052633e-06, "loss": 0.0086, "step": 21320 }, { "epoch": 8.770559210526315, "grad_norm": 0.02606067806482315, "learning_rate": 1.2343750000000001e-06, "loss": 0.003, "step": 21330 }, { "epoch": 8.774671052631579, "grad_norm": 0.06156745180487633, "learning_rate": 1.230263157894737e-06, "loss": 0.0029, "step": 21340 }, { "epoch": 8.778782894736842, "grad_norm": 0.0004447989631444216, "learning_rate": 1.2261513157894738e-06, "loss": 0.0738, "step": 21350 }, { "epoch": 8.782894736842106, "grad_norm": 0.028903665021061897, "learning_rate": 1.2220394736842106e-06, "loss": 0.0179, "step": 21360 }, { "epoch": 8.787006578947368, "grad_norm": 0.000259620719589293, "learning_rate": 1.2179276315789474e-06, "loss": 0.0001, "step": 21370 }, { "epoch": 8.791118421052632, "grad_norm": 0.00042439354001544416, "learning_rate": 1.2138157894736842e-06, "loss": 0.0192, "step": 21380 }, { "epoch": 8.795230263157894, "grad_norm": 1.8762305974960327, "learning_rate": 1.2097039473684213e-06, "loss": 0.0011, "step": 21390 }, { "epoch": 8.799342105263158, "grad_norm": 0.0003334373177494854, "learning_rate": 1.2055921052631579e-06, "loss": 0.0928, "step": 21400 }, { "epoch": 8.803453947368421, "grad_norm": 3.652294874191284, "learning_rate": 1.2014802631578947e-06, "loss": 0.1625, "step": 21410 }, { "epoch": 8.807565789473685, "grad_norm": 0.31108805537223816, "learning_rate": 1.1973684210526317e-06, "loss": 0.2264, "step": 21420 }, { "epoch": 8.811677631578947, "grad_norm": 0.024107256904244423, "learning_rate": 1.1932565789473686e-06, "loss": 0.0295, "step": 21430 }, { "epoch": 8.81578947368421, "grad_norm": 0.04832738637924194, "learning_rate": 1.1891447368421054e-06, "loss": 0.0005, "step": 21440 }, { "epoch": 8.819901315789474, "grad_norm": 0.0019105199025943875, "learning_rate": 1.1850328947368422e-06, "loss": 0.08, "step": 21450 }, { "epoch": 8.824013157894736, "grad_norm": 0.08762315660715103, "learning_rate": 1.180921052631579e-06, "loss": 0.0007, "step": 21460 }, { "epoch": 8.828125, "grad_norm": 0.7652416825294495, "learning_rate": 1.1772203947368422e-06, "loss": 0.0521, "step": 21470 }, { "epoch": 8.832236842105264, "grad_norm": 0.46437129378318787, "learning_rate": 1.173108552631579e-06, "loss": 0.0101, "step": 21480 }, { "epoch": 8.836348684210526, "grad_norm": 0.013920686207711697, "learning_rate": 1.1689967105263159e-06, "loss": 0.0024, "step": 21490 }, { "epoch": 8.84046052631579, "grad_norm": 0.11752483248710632, "learning_rate": 1.1648848684210527e-06, "loss": 0.0021, "step": 21500 }, { "epoch": 8.844572368421053, "grad_norm": 0.28659138083457947, "learning_rate": 1.1607730263157895e-06, "loss": 0.0015, "step": 21510 }, { "epoch": 8.848684210526315, "grad_norm": 0.02271687239408493, "learning_rate": 1.1566611842105265e-06, "loss": 0.0097, "step": 21520 }, { "epoch": 8.852796052631579, "grad_norm": 0.3271775543689728, "learning_rate": 1.1525493421052634e-06, "loss": 0.0306, "step": 21530 }, { "epoch": 8.856907894736842, "grad_norm": 0.0006081332103349268, "learning_rate": 1.1484375e-06, "loss": 0.0006, "step": 21540 }, { "epoch": 8.861019736842106, "grad_norm": 0.009489807300269604, "learning_rate": 1.144325657894737e-06, "loss": 0.0002, "step": 21550 }, { "epoch": 8.865131578947368, "grad_norm": 22.21544075012207, "learning_rate": 1.1402138157894738e-06, "loss": 0.0096, "step": 21560 }, { "epoch": 8.869243421052632, "grad_norm": 3.277864379924722e-05, "learning_rate": 1.1361019736842106e-06, "loss": 0.0056, "step": 21570 }, { "epoch": 8.873355263157894, "grad_norm": 0.003241215832531452, "learning_rate": 1.1319901315789475e-06, "loss": 0.0592, "step": 21580 }, { "epoch": 8.877467105263158, "grad_norm": 0.15321673452854156, "learning_rate": 1.1278782894736843e-06, "loss": 0.0005, "step": 21590 }, { "epoch": 8.881578947368421, "grad_norm": 0.0789317786693573, "learning_rate": 1.123766447368421e-06, "loss": 0.0005, "step": 21600 }, { "epoch": 8.885690789473685, "grad_norm": 70.32683563232422, "learning_rate": 1.119654605263158e-06, "loss": 0.1845, "step": 21610 }, { "epoch": 8.889802631578947, "grad_norm": 0.24425408244132996, "learning_rate": 1.1155427631578947e-06, "loss": 0.0718, "step": 21620 }, { "epoch": 8.89391447368421, "grad_norm": 0.005757387727499008, "learning_rate": 1.1114309210526318e-06, "loss": 0.0035, "step": 21630 }, { "epoch": 8.898026315789474, "grad_norm": 37.92292785644531, "learning_rate": 1.1073190789473686e-06, "loss": 0.0205, "step": 21640 }, { "epoch": 8.902138157894736, "grad_norm": 1.8165327310562134, "learning_rate": 1.1032072368421054e-06, "loss": 0.0105, "step": 21650 }, { "epoch": 8.90625, "grad_norm": 0.013413684442639351, "learning_rate": 1.0990953947368422e-06, "loss": 0.0092, "step": 21660 }, { "epoch": 8.910361842105264, "grad_norm": 0.0035779434256255627, "learning_rate": 1.094983552631579e-06, "loss": 0.0016, "step": 21670 }, { "epoch": 8.914473684210526, "grad_norm": 0.03039056807756424, "learning_rate": 1.0908717105263159e-06, "loss": 0.0226, "step": 21680 }, { "epoch": 8.91858552631579, "grad_norm": 0.0575411431491375, "learning_rate": 1.0867598684210527e-06, "loss": 0.0301, "step": 21690 }, { "epoch": 8.922697368421053, "grad_norm": 0.02829873189330101, "learning_rate": 1.0826480263157895e-06, "loss": 0.0019, "step": 21700 }, { "epoch": 8.926809210526315, "grad_norm": 0.5007913112640381, "learning_rate": 1.0785361842105265e-06, "loss": 0.0004, "step": 21710 }, { "epoch": 8.930921052631579, "grad_norm": 0.0031358154956251383, "learning_rate": 1.0744243421052634e-06, "loss": 0.0004, "step": 21720 }, { "epoch": 8.935032894736842, "grad_norm": 0.01469179056584835, "learning_rate": 1.0703125e-06, "loss": 0.0301, "step": 21730 }, { "epoch": 8.939144736842106, "grad_norm": 28.088376998901367, "learning_rate": 1.0662006578947368e-06, "loss": 0.0274, "step": 21740 }, { "epoch": 8.943256578947368, "grad_norm": 0.07739987969398499, "learning_rate": 1.0620888157894738e-06, "loss": 0.0008, "step": 21750 }, { "epoch": 8.947368421052632, "grad_norm": 0.11320428550243378, "learning_rate": 1.0579769736842106e-06, "loss": 0.0008, "step": 21760 }, { "epoch": 8.951480263157894, "grad_norm": 0.1822323352098465, "learning_rate": 1.0538651315789475e-06, "loss": 0.0009, "step": 21770 }, { "epoch": 8.955592105263158, "grad_norm": 0.020420599728822708, "learning_rate": 1.0497532894736843e-06, "loss": 0.1801, "step": 21780 }, { "epoch": 8.959703947368421, "grad_norm": 0.0003280929522588849, "learning_rate": 1.045641447368421e-06, "loss": 0.0017, "step": 21790 }, { "epoch": 8.963815789473685, "grad_norm": 0.02803770639002323, "learning_rate": 1.041529605263158e-06, "loss": 0.0004, "step": 21800 }, { "epoch": 8.967927631578947, "grad_norm": 0.03750944510102272, "learning_rate": 1.0374177631578947e-06, "loss": 0.0144, "step": 21810 }, { "epoch": 8.97203947368421, "grad_norm": 0.004547052551060915, "learning_rate": 1.0333059210526316e-06, "loss": 0.0019, "step": 21820 }, { "epoch": 8.976151315789474, "grad_norm": 0.030044863000512123, "learning_rate": 1.0291940789473686e-06, "loss": 0.0018, "step": 21830 }, { "epoch": 8.980263157894736, "grad_norm": 0.09664493799209595, "learning_rate": 1.0250822368421054e-06, "loss": 0.0016, "step": 21840 }, { "epoch": 8.984375, "grad_norm": 0.0004916651523672044, "learning_rate": 1.0209703947368422e-06, "loss": 0.0146, "step": 21850 }, { "epoch": 8.988486842105264, "grad_norm": 0.013362167403101921, "learning_rate": 1.016858552631579e-06, "loss": 0.0009, "step": 21860 }, { "epoch": 8.992598684210526, "grad_norm": 0.001229934743605554, "learning_rate": 1.0127467105263159e-06, "loss": 0.0507, "step": 21870 }, { "epoch": 8.99671052631579, "grad_norm": 0.0040124147199094296, "learning_rate": 1.0086348684210527e-06, "loss": 0.0034, "step": 21880 }, { "epoch": 9.000822368421053, "grad_norm": 0.1037806048989296, "learning_rate": 1.0045230263157895e-06, "loss": 0.0411, "step": 21890 }, { "epoch": 9.004934210526315, "grad_norm": 0.000802923459559679, "learning_rate": 1.0004111842105263e-06, "loss": 0.0003, "step": 21900 }, { "epoch": 9.009046052631579, "grad_norm": 0.000517787819262594, "learning_rate": 9.962993421052634e-07, "loss": 0.0003, "step": 21910 }, { "epoch": 9.013157894736842, "grad_norm": 0.00036574091063812375, "learning_rate": 9.921875e-07, "loss": 0.0568, "step": 21920 }, { "epoch": 9.017269736842104, "grad_norm": 2.1960201263427734, "learning_rate": 9.880756578947368e-07, "loss": 0.0348, "step": 21930 }, { "epoch": 9.021381578947368, "grad_norm": 0.002095481613650918, "learning_rate": 9.839638157894738e-07, "loss": 0.0066, "step": 21940 }, { "epoch": 9.025493421052632, "grad_norm": 0.0013046304229646921, "learning_rate": 9.798519736842106e-07, "loss": 0.0002, "step": 21950 }, { "epoch": 9.029605263157896, "grad_norm": 0.1555413454771042, "learning_rate": 9.757401315789475e-07, "loss": 0.0024, "step": 21960 }, { "epoch": 9.033717105263158, "grad_norm": 1.8907396793365479, "learning_rate": 9.716282894736843e-07, "loss": 0.001, "step": 21970 }, { "epoch": 9.037828947368421, "grad_norm": 2.040466785430908, "learning_rate": 9.675164473684211e-07, "loss": 0.0006, "step": 21980 }, { "epoch": 9.041940789473685, "grad_norm": 0.0005111195496283472, "learning_rate": 9.63404605263158e-07, "loss": 0.0011, "step": 21990 }, { "epoch": 9.046052631578947, "grad_norm": 0.012304543517529964, "learning_rate": 9.592927631578947e-07, "loss": 0.0001, "step": 22000 }, { "epoch": 9.05016447368421, "grad_norm": 0.05926978588104248, "learning_rate": 9.551809210526316e-07, "loss": 0.0003, "step": 22010 }, { "epoch": 9.054276315789474, "grad_norm": 0.0007231761119328439, "learning_rate": 9.510690789473685e-07, "loss": 0.0357, "step": 22020 }, { "epoch": 9.058388157894736, "grad_norm": 0.002011606004089117, "learning_rate": 9.469572368421054e-07, "loss": 0.0177, "step": 22030 }, { "epoch": 9.0625, "grad_norm": 0.09335087239742279, "learning_rate": 9.428453947368422e-07, "loss": 0.0051, "step": 22040 }, { "epoch": 9.066611842105264, "grad_norm": 0.008873417042195797, "learning_rate": 9.38733552631579e-07, "loss": 0.001, "step": 22050 }, { "epoch": 9.070723684210526, "grad_norm": 1.5110753774642944, "learning_rate": 9.346217105263159e-07, "loss": 0.0517, "step": 22060 }, { "epoch": 9.07483552631579, "grad_norm": 0.16701540350914001, "learning_rate": 9.305098684210527e-07, "loss": 0.0404, "step": 22070 }, { "epoch": 9.078947368421053, "grad_norm": 0.0011018244549632072, "learning_rate": 9.263980263157895e-07, "loss": 0.102, "step": 22080 }, { "epoch": 9.083059210526315, "grad_norm": 0.0009757328662090003, "learning_rate": 9.222861842105264e-07, "loss": 0.0002, "step": 22090 }, { "epoch": 9.087171052631579, "grad_norm": 0.0028861812315881252, "learning_rate": 9.181743421052633e-07, "loss": 0.0392, "step": 22100 }, { "epoch": 9.091282894736842, "grad_norm": 0.01574259251356125, "learning_rate": 9.140625e-07, "loss": 0.0057, "step": 22110 }, { "epoch": 9.095394736842104, "grad_norm": 0.04924844950437546, "learning_rate": 9.099506578947369e-07, "loss": 0.0002, "step": 22120 }, { "epoch": 9.099506578947368, "grad_norm": 28.6784725189209, "learning_rate": 9.058388157894737e-07, "loss": 0.0155, "step": 22130 }, { "epoch": 9.103618421052632, "grad_norm": 0.0016250410117208958, "learning_rate": 9.017269736842107e-07, "loss": 0.0122, "step": 22140 }, { "epoch": 9.107730263157896, "grad_norm": 0.0031066699884831905, "learning_rate": 8.976151315789475e-07, "loss": 0.0001, "step": 22150 }, { "epoch": 9.111842105263158, "grad_norm": 0.30667412281036377, "learning_rate": 8.935032894736843e-07, "loss": 0.0006, "step": 22160 }, { "epoch": 9.115953947368421, "grad_norm": 0.000589480740018189, "learning_rate": 8.89391447368421e-07, "loss": 0.0004, "step": 22170 }, { "epoch": 9.120065789473685, "grad_norm": 0.02490841969847679, "learning_rate": 8.852796052631579e-07, "loss": 0.0282, "step": 22180 }, { "epoch": 9.124177631578947, "grad_norm": 0.004660974722355604, "learning_rate": 8.811677631578948e-07, "loss": 0.0033, "step": 22190 }, { "epoch": 9.12828947368421, "grad_norm": 0.15241557359695435, "learning_rate": 8.770559210526317e-07, "loss": 0.0019, "step": 22200 }, { "epoch": 9.132401315789474, "grad_norm": 0.08407646417617798, "learning_rate": 8.729440789473685e-07, "loss": 0.008, "step": 22210 }, { "epoch": 9.136513157894736, "grad_norm": 0.28385263681411743, "learning_rate": 8.688322368421054e-07, "loss": 0.0022, "step": 22220 }, { "epoch": 9.140625, "grad_norm": 0.00033503732993267477, "learning_rate": 8.647203947368422e-07, "loss": 0.0004, "step": 22230 }, { "epoch": 9.144736842105264, "grad_norm": 0.01926404982805252, "learning_rate": 8.60608552631579e-07, "loss": 0.0566, "step": 22240 }, { "epoch": 9.148848684210526, "grad_norm": 26.195524215698242, "learning_rate": 8.564967105263158e-07, "loss": 0.0118, "step": 22250 }, { "epoch": 9.15296052631579, "grad_norm": 0.1406288743019104, "learning_rate": 8.523848684210527e-07, "loss": 0.3019, "step": 22260 }, { "epoch": 9.157072368421053, "grad_norm": 0.5480867028236389, "learning_rate": 8.482730263157895e-07, "loss": 0.0158, "step": 22270 }, { "epoch": 9.161184210526315, "grad_norm": 0.13859370350837708, "learning_rate": 8.441611842105265e-07, "loss": 0.0009, "step": 22280 }, { "epoch": 9.165296052631579, "grad_norm": 0.0005784053937532008, "learning_rate": 8.400493421052633e-07, "loss": 0.0032, "step": 22290 }, { "epoch": 9.169407894736842, "grad_norm": 8.995039939880371, "learning_rate": 8.359375e-07, "loss": 0.0035, "step": 22300 }, { "epoch": 9.173519736842104, "grad_norm": 0.01929420232772827, "learning_rate": 8.318256578947369e-07, "loss": 0.0166, "step": 22310 }, { "epoch": 9.177631578947368, "grad_norm": 0.20890386402606964, "learning_rate": 8.277138157894737e-07, "loss": 0.0017, "step": 22320 }, { "epoch": 9.181743421052632, "grad_norm": 0.1153140589594841, "learning_rate": 8.236019736842106e-07, "loss": 0.2221, "step": 22330 }, { "epoch": 9.185855263157896, "grad_norm": 2.0475165843963623, "learning_rate": 8.194901315789475e-07, "loss": 0.0247, "step": 22340 }, { "epoch": 9.189967105263158, "grad_norm": 0.14061927795410156, "learning_rate": 8.153782894736843e-07, "loss": 0.0015, "step": 22350 }, { "epoch": 9.194078947368421, "grad_norm": 0.0035043880343437195, "learning_rate": 8.11266447368421e-07, "loss": 0.0017, "step": 22360 }, { "epoch": 9.198190789473685, "grad_norm": 61.89847183227539, "learning_rate": 8.071546052631579e-07, "loss": 0.0711, "step": 22370 }, { "epoch": 9.202302631578947, "grad_norm": 1.4339483976364136, "learning_rate": 8.030427631578948e-07, "loss": 0.0583, "step": 22380 }, { "epoch": 9.20641447368421, "grad_norm": 0.03647860139608383, "learning_rate": 7.989309210526317e-07, "loss": 0.0021, "step": 22390 }, { "epoch": 9.210526315789474, "grad_norm": 0.0065275998786091805, "learning_rate": 7.948190789473685e-07, "loss": 0.0, "step": 22400 }, { "epoch": 9.214638157894736, "grad_norm": 0.012942095287144184, "learning_rate": 7.907072368421053e-07, "loss": 0.0008, "step": 22410 }, { "epoch": 9.21875, "grad_norm": 0.003749582916498184, "learning_rate": 7.865953947368423e-07, "loss": 0.0407, "step": 22420 }, { "epoch": 9.222861842105264, "grad_norm": 0.019273262470960617, "learning_rate": 7.82483552631579e-07, "loss": 0.0023, "step": 22430 }, { "epoch": 9.226973684210526, "grad_norm": 0.02620495669543743, "learning_rate": 7.783717105263158e-07, "loss": 0.0824, "step": 22440 }, { "epoch": 9.23108552631579, "grad_norm": 0.13408267498016357, "learning_rate": 7.742598684210527e-07, "loss": 0.004, "step": 22450 }, { "epoch": 9.235197368421053, "grad_norm": 0.07048587501049042, "learning_rate": 7.701480263157895e-07, "loss": 0.0011, "step": 22460 }, { "epoch": 9.239309210526315, "grad_norm": 0.2926476299762726, "learning_rate": 7.660361842105265e-07, "loss": 0.0032, "step": 22470 }, { "epoch": 9.243421052631579, "grad_norm": 0.004648135043680668, "learning_rate": 7.619243421052633e-07, "loss": 0.0001, "step": 22480 }, { "epoch": 9.247532894736842, "grad_norm": 0.007466164883226156, "learning_rate": 7.578125e-07, "loss": 0.0185, "step": 22490 }, { "epoch": 9.251644736842104, "grad_norm": 0.011168756522238255, "learning_rate": 7.537006578947368e-07, "loss": 0.0004, "step": 22500 }, { "epoch": 9.255756578947368, "grad_norm": 0.06408285349607468, "learning_rate": 7.495888157894737e-07, "loss": 0.0035, "step": 22510 }, { "epoch": 9.259868421052632, "grad_norm": 0.09527569264173508, "learning_rate": 7.454769736842106e-07, "loss": 0.0415, "step": 22520 }, { "epoch": 9.263980263157896, "grad_norm": 0.009627602994441986, "learning_rate": 7.413651315789475e-07, "loss": 0.0782, "step": 22530 }, { "epoch": 9.268092105263158, "grad_norm": 0.08369173854589462, "learning_rate": 7.372532894736843e-07, "loss": 0.0088, "step": 22540 }, { "epoch": 9.272203947368421, "grad_norm": 0.15961483120918274, "learning_rate": 7.33141447368421e-07, "loss": 0.0029, "step": 22550 }, { "epoch": 9.276315789473685, "grad_norm": 0.049691326916217804, "learning_rate": 7.29029605263158e-07, "loss": 0.0059, "step": 22560 }, { "epoch": 9.280427631578947, "grad_norm": 0.022236308082938194, "learning_rate": 7.249177631578948e-07, "loss": 0.0048, "step": 22570 }, { "epoch": 9.28453947368421, "grad_norm": 0.0647999495267868, "learning_rate": 7.208059210526316e-07, "loss": 0.0474, "step": 22580 }, { "epoch": 9.288651315789474, "grad_norm": 0.0038779708556830883, "learning_rate": 7.166940789473685e-07, "loss": 0.0001, "step": 22590 }, { "epoch": 9.292763157894736, "grad_norm": 0.05114118382334709, "learning_rate": 7.125822368421053e-07, "loss": 0.0007, "step": 22600 }, { "epoch": 9.296875, "grad_norm": 28.422271728515625, "learning_rate": 7.084703947368423e-07, "loss": 0.0144, "step": 22610 }, { "epoch": 9.300986842105264, "grad_norm": 0.00263079721480608, "learning_rate": 7.04358552631579e-07, "loss": 0.0036, "step": 22620 }, { "epoch": 9.305098684210526, "grad_norm": 0.002628036541864276, "learning_rate": 7.002467105263158e-07, "loss": 0.0002, "step": 22630 }, { "epoch": 9.30921052631579, "grad_norm": 0.0005118601256981492, "learning_rate": 6.961348684210527e-07, "loss": 0.0019, "step": 22640 }, { "epoch": 9.313322368421053, "grad_norm": 0.04837176576256752, "learning_rate": 6.920230263157895e-07, "loss": 0.0003, "step": 22650 }, { "epoch": 9.317434210526315, "grad_norm": 0.004665362648665905, "learning_rate": 6.879111842105264e-07, "loss": 0.0317, "step": 22660 }, { "epoch": 9.321546052631579, "grad_norm": 0.0012209968408569694, "learning_rate": 6.837993421052633e-07, "loss": 0.0001, "step": 22670 }, { "epoch": 9.325657894736842, "grad_norm": 0.07622142881155014, "learning_rate": 6.796875e-07, "loss": 0.0073, "step": 22680 }, { "epoch": 9.329769736842104, "grad_norm": 0.00021652725990861654, "learning_rate": 6.755756578947368e-07, "loss": 0.0001, "step": 22690 }, { "epoch": 9.333881578947368, "grad_norm": 0.0072752246633172035, "learning_rate": 6.714638157894738e-07, "loss": 0.0042, "step": 22700 }, { "epoch": 9.337993421052632, "grad_norm": 0.013057149946689606, "learning_rate": 6.673519736842106e-07, "loss": 0.0013, "step": 22710 }, { "epoch": 9.342105263157896, "grad_norm": 1.3909385204315186, "learning_rate": 6.632401315789475e-07, "loss": 0.1189, "step": 22720 }, { "epoch": 9.346217105263158, "grad_norm": 0.005942532327026129, "learning_rate": 6.591282894736843e-07, "loss": 0.0, "step": 22730 }, { "epoch": 9.350328947368421, "grad_norm": 0.0395037904381752, "learning_rate": 6.55016447368421e-07, "loss": 0.0002, "step": 22740 }, { "epoch": 9.354440789473685, "grad_norm": 0.003000462194904685, "learning_rate": 6.50904605263158e-07, "loss": 0.0054, "step": 22750 }, { "epoch": 9.358552631578947, "grad_norm": 0.0012729447335004807, "learning_rate": 6.467927631578948e-07, "loss": 0.0, "step": 22760 }, { "epoch": 9.36266447368421, "grad_norm": 0.07837924361228943, "learning_rate": 6.426809210526316e-07, "loss": 0.0873, "step": 22770 }, { "epoch": 9.366776315789474, "grad_norm": 0.05736375227570534, "learning_rate": 6.385690789473685e-07, "loss": 0.4035, "step": 22780 }, { "epoch": 9.370888157894736, "grad_norm": 0.004519197158515453, "learning_rate": 6.344572368421053e-07, "loss": 0.0047, "step": 22790 }, { "epoch": 9.375, "grad_norm": 1.6480873823165894, "learning_rate": 6.303453947368423e-07, "loss": 0.0017, "step": 22800 }, { "epoch": 9.379111842105264, "grad_norm": 0.0007083399686962366, "learning_rate": 6.26233552631579e-07, "loss": 0.0004, "step": 22810 }, { "epoch": 9.383223684210526, "grad_norm": 81.98469543457031, "learning_rate": 6.221217105263158e-07, "loss": 0.2035, "step": 22820 }, { "epoch": 9.38733552631579, "grad_norm": 1.910955786705017, "learning_rate": 6.180098684210527e-07, "loss": 0.0008, "step": 22830 }, { "epoch": 9.391447368421053, "grad_norm": 0.5536595582962036, "learning_rate": 6.138980263157896e-07, "loss": 0.0185, "step": 22840 }, { "epoch": 9.395559210526315, "grad_norm": 0.0020302587654441595, "learning_rate": 6.097861842105264e-07, "loss": 0.0563, "step": 22850 }, { "epoch": 9.399671052631579, "grad_norm": 0.025456195697188377, "learning_rate": 6.056743421052632e-07, "loss": 0.0001, "step": 22860 }, { "epoch": 9.403782894736842, "grad_norm": 0.043300170451402664, "learning_rate": 6.015625e-07, "loss": 0.0002, "step": 22870 }, { "epoch": 9.407894736842104, "grad_norm": 0.5046695470809937, "learning_rate": 5.974506578947368e-07, "loss": 0.0049, "step": 22880 }, { "epoch": 9.412006578947368, "grad_norm": 0.00038293583202175796, "learning_rate": 5.933388157894738e-07, "loss": 0.0003, "step": 22890 }, { "epoch": 9.416118421052632, "grad_norm": 0.027482980862259865, "learning_rate": 5.892269736842106e-07, "loss": 0.0003, "step": 22900 }, { "epoch": 9.420230263157896, "grad_norm": 0.08259539306163788, "learning_rate": 5.851151315789474e-07, "loss": 0.0004, "step": 22910 }, { "epoch": 9.424342105263158, "grad_norm": 0.002156503265723586, "learning_rate": 5.810032894736842e-07, "loss": 0.0002, "step": 22920 }, { "epoch": 9.428453947368421, "grad_norm": 0.06109001114964485, "learning_rate": 5.768914473684211e-07, "loss": 0.0004, "step": 22930 }, { "epoch": 9.432565789473685, "grad_norm": 1.3648306131362915, "learning_rate": 5.72779605263158e-07, "loss": 0.0007, "step": 22940 }, { "epoch": 9.436677631578947, "grad_norm": 0.038669224828481674, "learning_rate": 5.686677631578948e-07, "loss": 0.0002, "step": 22950 }, { "epoch": 9.44078947368421, "grad_norm": 0.06159086897969246, "learning_rate": 5.645559210526316e-07, "loss": 0.0124, "step": 22960 }, { "epoch": 9.444901315789474, "grad_norm": 1.0390788316726685, "learning_rate": 5.604440789473685e-07, "loss": 0.0027, "step": 22970 }, { "epoch": 9.449013157894736, "grad_norm": 0.0944819375872612, "learning_rate": 5.563322368421052e-07, "loss": 0.0005, "step": 22980 }, { "epoch": 9.453125, "grad_norm": 0.00038404561928473413, "learning_rate": 5.522203947368422e-07, "loss": 0.0125, "step": 22990 }, { "epoch": 9.457236842105264, "grad_norm": 0.001499609206803143, "learning_rate": 5.48108552631579e-07, "loss": 0.1189, "step": 23000 } ], "logging_steps": 10, "max_steps": 24320, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }