diff --git "a/checkpoints/lora_grounded_obj_ref_checkpoint-4896/trainer_state.json" "b/checkpoints/lora_grounded_obj_ref_checkpoint-4896/trainer_state.json" new file mode 100755--- /dev/null +++ "b/checkpoints/lora_grounded_obj_ref_checkpoint-4896/trainer_state.json" @@ -0,0 +1,39760 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0016353229762878, + "global_step": 4896, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 0.00019999999917518445, + "lm_loss": 0.59375, + "loss": 0.5391, + "step": 1, + "total_loss": 0.59375 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019999999670073778, + "lm_loss": 0.51953125, + "loss": 0.4974, + "step": 2, + "total_loss": 0.51953125 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019999999257666007, + "lm_loss": 0.482421875, + "loss": 0.4663, + "step": 3, + "total_loss": 0.482421875 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019999998680295136, + "lm_loss": 0.359375, + "loss": 0.3953, + "step": 4, + "total_loss": 0.359375 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019999997937961177, + "lm_loss": 0.333984375, + "loss": 0.3252, + "step": 5, + "total_loss": 0.333984375 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001999999703066414, + "lm_loss": 0.244140625, + "loss": 0.2549, + "step": 6, + "total_loss": 0.244140625 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019999995958404037, + "lm_loss": 0.2216796875, + "loss": 0.1975, + "step": 7, + "total_loss": 0.2216796875 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019999994721180893, + "lm_loss": 0.1484375, + "loss": 0.1483, + "step": 8, + "total_loss": 0.1484375 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019999993318994724, + "lm_loss": 0.10595703125, + "loss": 0.1169, + "step": 9, + "total_loss": 0.10595703125 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019999991751845552, + "lm_loss": 0.1162109375, + "loss": 0.0988, + "step": 10, + "total_loss": 0.1162109375 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019999990019733407, + "lm_loss": 0.095703125, + "loss": 0.0734, + "step": 11, + "total_loss": 0.095703125 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019999988122658313, + "lm_loss": 0.072265625, + "loss": 0.0541, + "step": 12, + "total_loss": 0.072265625 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999986060620305, + "lm_loss": 0.060546875, + "loss": 0.0553, + "step": 13, + "total_loss": 0.060546875 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999983833619416, + "lm_loss": 0.068359375, + "loss": 0.0543, + "step": 14, + "total_loss": 0.068359375 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001999998144165568, + "lm_loss": 0.0634765625, + "loss": 0.0447, + "step": 15, + "total_loss": 0.0634765625 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001999997888472914, + "lm_loss": 0.0478515625, + "loss": 0.0422, + "step": 16, + "total_loss": 0.0478515625 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999976162839836, + "lm_loss": 0.05126953125, + "loss": 0.0398, + "step": 17, + "total_loss": 0.05126953125 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999973275987816, + "lm_loss": 0.040283203125, + "loss": 0.0423, + "step": 18, + "total_loss": 0.040283203125 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999970224173126, + "lm_loss": 0.030029296875, + "loss": 0.0366, + "step": 19, + "total_loss": 0.030029296875 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999967007395812, + "lm_loss": 0.0272216796875, + "loss": 0.0382, + "step": 20, + "total_loss": 0.0272216796875 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999963625655934, + "lm_loss": 0.0361328125, + "loss": 0.0277, + "step": 21, + "total_loss": 0.0361328125 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001999996007895354, + "lm_loss": 0.02685546875, + "loss": 0.026, + "step": 22, + "total_loss": 0.02685546875 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999956367288698, + "lm_loss": 0.034912109375, + "loss": 0.0308, + "step": 23, + "total_loss": 0.034912109375 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999952490661464, + "lm_loss": 0.0296630859375, + "loss": 0.0244, + "step": 24, + "total_loss": 0.0296630859375 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999948449071902, + "lm_loss": 0.034912109375, + "loss": 0.0246, + "step": 25, + "total_loss": 0.034912109375 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001999994424252008, + "lm_loss": 0.028564453125, + "loss": 0.0209, + "step": 26, + "total_loss": 0.028564453125 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999939871006062, + "lm_loss": 0.025390625, + "loss": 0.0219, + "step": 27, + "total_loss": 0.025390625 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001999993533452993, + "lm_loss": 0.02978515625, + "loss": 0.0246, + "step": 28, + "total_loss": 0.02978515625 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999930633091747, + "lm_loss": 0.0223388671875, + "loss": 0.0262, + "step": 29, + "total_loss": 0.0223388671875 + }, + { + "epoch": 0.01, + "learning_rate": 0.000199999257666916, + "lm_loss": 0.02001953125, + "loss": 0.0242, + "step": 30, + "total_loss": 0.02001953125 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999920735329566, + "lm_loss": 0.029296875, + "loss": 0.0214, + "step": 31, + "total_loss": 0.029296875 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999915539005731, + "lm_loss": 0.0277099609375, + "loss": 0.0207, + "step": 32, + "total_loss": 0.0277099609375 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999910177720173, + "lm_loss": 0.01953125, + "loss": 0.0215, + "step": 33, + "total_loss": 0.01953125 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001999990465147299, + "lm_loss": 0.01190185546875, + "loss": 0.0199, + "step": 34, + "total_loss": 0.01190185546875 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999898960264265, + "lm_loss": 0.01953125, + "loss": 0.0199, + "step": 35, + "total_loss": 0.01953125 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019999893104094096, + "lm_loss": 0.01263427734375, + "loss": 0.0231, + "step": 36, + "total_loss": 0.01263427734375 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001999988708296258, + "lm_loss": 0.028076171875, + "loss": 0.0206, + "step": 37, + "total_loss": 0.028076171875 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999880896869816, + "lm_loss": 0.01434326171875, + "loss": 0.0165, + "step": 38, + "total_loss": 0.01434326171875 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999874545815902, + "lm_loss": 0.0302734375, + "loss": 0.0216, + "step": 39, + "total_loss": 0.0302734375 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001999986802980095, + "lm_loss": 0.013427734375, + "loss": 0.0163, + "step": 40, + "total_loss": 0.013427734375 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999861348825063, + "lm_loss": 0.01495361328125, + "loss": 0.0152, + "step": 41, + "total_loss": 0.01495361328125 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001999985450288835, + "lm_loss": 0.011962890625, + "loss": 0.0168, + "step": 42, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999847491990926, + "lm_loss": 0.0133056640625, + "loss": 0.0179, + "step": 43, + "total_loss": 0.0133056640625 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999840316132906, + "lm_loss": 0.0164794921875, + "loss": 0.0179, + "step": 44, + "total_loss": 0.0164794921875 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001999983297531441, + "lm_loss": 0.0164794921875, + "loss": 0.0132, + "step": 45, + "total_loss": 0.0164794921875 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999825469535558, + "lm_loss": 0.01239013671875, + "loss": 0.0205, + "step": 46, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999817798796473, + "lm_loss": 0.020751953125, + "loss": 0.0177, + "step": 47, + "total_loss": 0.020751953125 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999809963097283, + "lm_loss": 0.0157470703125, + "loss": 0.0153, + "step": 48, + "total_loss": 0.0157470703125 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999801962438115, + "lm_loss": 0.01214599609375, + "loss": 0.0189, + "step": 49, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.02, + "learning_rate": 0.000199997937968191, + "lm_loss": 0.0145263671875, + "loss": 0.0141, + "step": 50, + "total_loss": 0.0145263671875 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999785466240383, + "lm_loss": 0.02490234375, + "loss": 0.0181, + "step": 51, + "total_loss": 0.02490234375 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999776970702087, + "lm_loss": 0.0084228515625, + "loss": 0.0161, + "step": 52, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999768310204362, + "lm_loss": 0.01495361328125, + "loss": 0.0173, + "step": 53, + "total_loss": 0.01495361328125 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999759484747343, + "lm_loss": 0.01531982421875, + "loss": 0.0185, + "step": 54, + "total_loss": 0.01531982421875 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999750494331187, + "lm_loss": 0.0203857421875, + "loss": 0.0156, + "step": 55, + "total_loss": 0.0203857421875 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001999974133895603, + "lm_loss": 0.031005859375, + "loss": 0.0172, + "step": 56, + "total_loss": 0.031005859375 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999732018622033, + "lm_loss": 0.0250244140625, + "loss": 0.0185, + "step": 57, + "total_loss": 0.0250244140625 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999722533329344, + "lm_loss": 0.01495361328125, + "loss": 0.019, + "step": 58, + "total_loss": 0.01495361328125 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999712883078118, + "lm_loss": 0.01300048828125, + "loss": 0.0147, + "step": 59, + "total_loss": 0.01300048828125 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001999970306786852, + "lm_loss": 0.0166015625, + "loss": 0.0161, + "step": 60, + "total_loss": 0.0166015625 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019999693087700707, + "lm_loss": 0.0172119140625, + "loss": 0.0145, + "step": 61, + "total_loss": 0.0172119140625 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999682942574843, + "lm_loss": 0.01251220703125, + "loss": 0.0136, + "step": 62, + "total_loss": 0.01251220703125 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999672632491102, + "lm_loss": 0.01519775390625, + "loss": 0.0161, + "step": 63, + "total_loss": 0.01519775390625 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001999966215744965, + "lm_loss": 0.0133056640625, + "loss": 0.0144, + "step": 64, + "total_loss": 0.0133056640625 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999651517450657, + "lm_loss": 0.0206298828125, + "loss": 0.0134, + "step": 65, + "total_loss": 0.0206298828125 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999640712494303, + "lm_loss": 0.0093994140625, + "loss": 0.0164, + "step": 66, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999629742580764, + "lm_loss": 0.012939453125, + "loss": 0.0164, + "step": 67, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999618607710221, + "lm_loss": 0.0130615234375, + "loss": 0.0128, + "step": 68, + "total_loss": 0.0130615234375 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001999960730788286, + "lm_loss": 0.0126953125, + "loss": 0.017, + "step": 69, + "total_loss": 0.0126953125 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999595843098864, + "lm_loss": 0.020263671875, + "loss": 0.0143, + "step": 70, + "total_loss": 0.020263671875 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999584213358423, + "lm_loss": 0.0216064453125, + "loss": 0.0151, + "step": 71, + "total_loss": 0.0216064453125 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999572418661732, + "lm_loss": 0.0289306640625, + "loss": 0.0137, + "step": 72, + "total_loss": 0.0289306640625 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001999956045900898, + "lm_loss": 0.01373291015625, + "loss": 0.0151, + "step": 73, + "total_loss": 0.01373291015625 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999548334400372, + "lm_loss": 0.006683349609375, + "loss": 0.0124, + "step": 74, + "total_loss": 0.006683349609375 + }, + { + "epoch": 0.03, + "learning_rate": 0.000199995360448361, + "lm_loss": 0.01434326171875, + "loss": 0.0153, + "step": 75, + "total_loss": 0.01434326171875 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001999952359031637, + "lm_loss": 0.006103515625, + "loss": 0.016, + "step": 76, + "total_loss": 0.006103515625 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001999951097084139, + "lm_loss": 0.0093994140625, + "loss": 0.0137, + "step": 77, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999498186411364, + "lm_loss": 0.0211181640625, + "loss": 0.0159, + "step": 78, + "total_loss": 0.0211181640625 + }, + { + "epoch": 0.03, + "learning_rate": 0.000199994852370265, + "lm_loss": 0.01080322265625, + "loss": 0.0128, + "step": 79, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999472122687024, + "lm_loss": 0.0174560546875, + "loss": 0.0133, + "step": 80, + "total_loss": 0.0174560546875 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001999945884339314, + "lm_loss": 0.01104736328125, + "loss": 0.014, + "step": 81, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999445399145074, + "lm_loss": 0.017578125, + "loss": 0.0131, + "step": 82, + "total_loss": 0.017578125 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999431789943044, + "lm_loss": 0.01025390625, + "loss": 0.0137, + "step": 83, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999418015787277, + "lm_loss": 0.00909423828125, + "loss": 0.0149, + "step": 84, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019999404076677998, + "lm_loss": 0.03271484375, + "loss": 0.0137, + "step": 85, + "total_loss": 0.03271484375 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001999938997261544, + "lm_loss": 0.009033203125, + "loss": 0.0126, + "step": 86, + "total_loss": 0.009033203125 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001999937570359983, + "lm_loss": 0.01458740234375, + "loss": 0.0119, + "step": 87, + "total_loss": 0.01458740234375 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001999936126963141, + "lm_loss": 0.0198974609375, + "loss": 0.0159, + "step": 88, + "total_loss": 0.0198974609375 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999346670710418, + "lm_loss": 0.01336669921875, + "loss": 0.0124, + "step": 89, + "total_loss": 0.01336669921875 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999331906837085, + "lm_loss": 0.01470947265625, + "loss": 0.0139, + "step": 90, + "total_loss": 0.01470947265625 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999316978011668, + "lm_loss": 0.00653076171875, + "loss": 0.0121, + "step": 91, + "total_loss": 0.00653076171875 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999301884234406, + "lm_loss": 0.0069580078125, + "loss": 0.0141, + "step": 92, + "total_loss": 0.0069580078125 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999286625505547, + "lm_loss": 0.0047607421875, + "loss": 0.0131, + "step": 93, + "total_loss": 0.0047607421875 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999271201825343, + "lm_loss": 0.005859375, + "loss": 0.0146, + "step": 94, + "total_loss": 0.005859375 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001999925561319405, + "lm_loss": 0.01434326171875, + "loss": 0.0156, + "step": 95, + "total_loss": 0.01434326171875 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999239859611932, + "lm_loss": 0.005859375, + "loss": 0.0107, + "step": 96, + "total_loss": 0.005859375 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999223941079234, + "lm_loss": 0.0047607421875, + "loss": 0.0117, + "step": 97, + "total_loss": 0.0047607421875 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999207857596232, + "lm_loss": 0.002960205078125, + "loss": 0.0113, + "step": 98, + "total_loss": 0.002960205078125 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999191609163184, + "lm_loss": 0.005615234375, + "loss": 0.0125, + "step": 99, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001999917519578036, + "lm_loss": 0.01287841796875, + "loss": 0.011, + "step": 100, + "total_loss": 0.01287841796875 + }, + { + "epoch": 0.04, + "eval_lm_loss": 0.014467097818851471, + "eval_loss": 0.015070343390107155, + "eval_runtime": 44.0715, + "eval_samples_per_second": 22.69, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.014467097818851471, + "lm_loss": 0.00119781494140625, + "step": 100, + "total_loss": 0.00119781494140625 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001999915861744803, + "lm_loss": 0.01519775390625, + "loss": 0.0121, + "step": 101, + "total_loss": 0.01519775390625 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999141874166472, + "lm_loss": 0.00830078125, + "loss": 0.0114, + "step": 102, + "total_loss": 0.00830078125 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999124965935954, + "lm_loss": 0.01043701171875, + "loss": 0.0139, + "step": 103, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001999910789275676, + "lm_loss": 0.0177001953125, + "loss": 0.0137, + "step": 104, + "total_loss": 0.0177001953125 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999090654629174, + "lm_loss": 0.007232666015625, + "loss": 0.0102, + "step": 105, + "total_loss": 0.007232666015625 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999073251553476, + "lm_loss": 0.01513671875, + "loss": 0.0116, + "step": 106, + "total_loss": 0.01513671875 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999055683529954, + "lm_loss": 0.0103759765625, + "loss": 0.0135, + "step": 107, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.04, + "learning_rate": 0.000199990379505589, + "lm_loss": 0.020751953125, + "loss": 0.0123, + "step": 108, + "total_loss": 0.020751953125 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019999020052640601, + "lm_loss": 0.00848388671875, + "loss": 0.0119, + "step": 109, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001999900198977536, + "lm_loss": 0.021728515625, + "loss": 0.0153, + "step": 110, + "total_loss": 0.021728515625 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998983761963468, + "lm_loss": 0.0166015625, + "loss": 0.0119, + "step": 111, + "total_loss": 0.0166015625 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998965369205228, + "lm_loss": 0.005126953125, + "loss": 0.0116, + "step": 112, + "total_loss": 0.005126953125 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998946811500948, + "lm_loss": 0.01373291015625, + "loss": 0.0127, + "step": 113, + "total_loss": 0.01373291015625 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998928088850928, + "lm_loss": 0.0146484375, + "loss": 0.0137, + "step": 114, + "total_loss": 0.0146484375 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001999890920125548, + "lm_loss": 0.0172119140625, + "loss": 0.0114, + "step": 115, + "total_loss": 0.0172119140625 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998890148714918, + "lm_loss": 0.006378173828125, + "loss": 0.0102, + "step": 116, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001999887093122955, + "lm_loss": 0.0146484375, + "loss": 0.0132, + "step": 117, + "total_loss": 0.0146484375 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998851548799694, + "lm_loss": 0.02392578125, + "loss": 0.0114, + "step": 118, + "total_loss": 0.02392578125 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998832001425672, + "lm_loss": 0.02294921875, + "loss": 0.0121, + "step": 119, + "total_loss": 0.02294921875 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001999881228910781, + "lm_loss": 0.0113525390625, + "loss": 0.0116, + "step": 120, + "total_loss": 0.0113525390625 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998792411846425, + "lm_loss": 0.005615234375, + "loss": 0.0104, + "step": 121, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998772369641851, + "lm_loss": 0.01043701171875, + "loss": 0.0103, + "step": 122, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001999875216249442, + "lm_loss": 0.0145263671875, + "loss": 0.0137, + "step": 123, + "total_loss": 0.0145263671875 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998731790404458, + "lm_loss": 0.019775390625, + "loss": 0.0137, + "step": 124, + "total_loss": 0.019775390625 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998711253372306, + "lm_loss": 0.00323486328125, + "loss": 0.0121, + "step": 125, + "total_loss": 0.00323486328125 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998690551398306, + "lm_loss": 0.0081787109375, + "loss": 0.0132, + "step": 126, + "total_loss": 0.0081787109375 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998669684482791, + "lm_loss": 0.009521484375, + "loss": 0.0116, + "step": 127, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001999864865262611, + "lm_loss": 0.00958251953125, + "loss": 0.0113, + "step": 128, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998627455828613, + "lm_loss": 0.00921630859375, + "loss": 0.0095, + "step": 129, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998606094090646, + "lm_loss": 0.018798828125, + "loss": 0.0134, + "step": 130, + "total_loss": 0.018798828125 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001999858456741256, + "lm_loss": 0.0238037109375, + "loss": 0.0128, + "step": 131, + "total_loss": 0.0238037109375 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998562875794712, + "lm_loss": 0.01220703125, + "loss": 0.0136, + "step": 132, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001999854101923746, + "lm_loss": 0.0111083984375, + "loss": 0.0109, + "step": 133, + "total_loss": 0.0111083984375 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019998518997741167, + "lm_loss": 0.011962890625, + "loss": 0.0132, + "step": 134, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001999849681130619, + "lm_loss": 0.0111083984375, + "loss": 0.0127, + "step": 135, + "total_loss": 0.0111083984375 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998474459932903, + "lm_loss": 0.01446533203125, + "loss": 0.0124, + "step": 136, + "total_loss": 0.01446533203125 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998451943621664, + "lm_loss": 0.0164794921875, + "loss": 0.0119, + "step": 137, + "total_loss": 0.0164794921875 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998429262372858, + "lm_loss": 0.005645751953125, + "loss": 0.012, + "step": 138, + "total_loss": 0.005645751953125 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998406416186847, + "lm_loss": 0.01458740234375, + "loss": 0.011, + "step": 139, + "total_loss": 0.01458740234375 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998383405064014, + "lm_loss": 0.019287109375, + "loss": 0.0117, + "step": 140, + "total_loss": 0.019287109375 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998360229004738, + "lm_loss": 0.01544189453125, + "loss": 0.0112, + "step": 141, + "total_loss": 0.01544189453125 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998336888009402, + "lm_loss": 0.00970458984375, + "loss": 0.0133, + "step": 142, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998313382078387, + "lm_loss": 0.0108642578125, + "loss": 0.0112, + "step": 143, + "total_loss": 0.0108642578125 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998289711212083, + "lm_loss": 0.0191650390625, + "loss": 0.01, + "step": 144, + "total_loss": 0.0191650390625 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998265875410885, + "lm_loss": 0.01556396484375, + "loss": 0.0122, + "step": 145, + "total_loss": 0.01556396484375 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001999824187467518, + "lm_loss": 0.00921630859375, + "loss": 0.0124, + "step": 146, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998217709005366, + "lm_loss": 0.01409912109375, + "loss": 0.0124, + "step": 147, + "total_loss": 0.01409912109375 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998193378401843, + "lm_loss": 0.0201416015625, + "loss": 0.0153, + "step": 148, + "total_loss": 0.0201416015625 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001999816888286501, + "lm_loss": 0.00885009765625, + "loss": 0.0099, + "step": 149, + "total_loss": 0.00885009765625 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998144222395272, + "lm_loss": 0.006866455078125, + "loss": 0.0141, + "step": 150, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998119396993035, + "lm_loss": 0.005706787109375, + "loss": 0.0117, + "step": 151, + "total_loss": 0.005706787109375 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998094406658713, + "lm_loss": 0.0084228515625, + "loss": 0.0104, + "step": 152, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998069251392714, + "lm_loss": 0.007415771484375, + "loss": 0.0125, + "step": 153, + "total_loss": 0.007415771484375 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001999804393119545, + "lm_loss": 0.01068115234375, + "loss": 0.0115, + "step": 154, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019998018446067345, + "lm_loss": 0.01007080078125, + "loss": 0.012, + "step": 155, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001999799279600882, + "lm_loss": 0.0157470703125, + "loss": 0.0115, + "step": 156, + "total_loss": 0.0157470703125 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001999796698102029, + "lm_loss": 0.010009765625, + "loss": 0.0114, + "step": 157, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019997941001102188, + "lm_loss": 0.0146484375, + "loss": 0.0113, + "step": 158, + "total_loss": 0.0146484375 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997914856254942, + "lm_loss": 0.01165771484375, + "loss": 0.0115, + "step": 159, + "total_loss": 0.01165771484375 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001999788854647898, + "lm_loss": 0.00921630859375, + "loss": 0.0128, + "step": 160, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997862071774737, + "lm_loss": 0.00732421875, + "loss": 0.0104, + "step": 161, + "total_loss": 0.00732421875 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997835432142654, + "lm_loss": 0.01177978515625, + "loss": 0.0117, + "step": 162, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997808627583163, + "lm_loss": 0.0115966796875, + "loss": 0.0128, + "step": 163, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001999778165809671, + "lm_loss": 0.00933837890625, + "loss": 0.0099, + "step": 164, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997754523683745, + "lm_loss": 0.0054931640625, + "loss": 0.0117, + "step": 165, + "total_loss": 0.0054931640625 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997727224344708, + "lm_loss": 0.014892578125, + "loss": 0.0124, + "step": 166, + "total_loss": 0.014892578125 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997699760080048, + "lm_loss": 0.01123046875, + "loss": 0.0111, + "step": 167, + "total_loss": 0.01123046875 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001999767213089023, + "lm_loss": 0.01312255859375, + "loss": 0.0114, + "step": 168, + "total_loss": 0.01312255859375 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997644336775693, + "lm_loss": 0.009521484375, + "loss": 0.014, + "step": 169, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001999761637773691, + "lm_loss": 0.0115966796875, + "loss": 0.011, + "step": 170, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997588253774335, + "lm_loss": 0.007568359375, + "loss": 0.0119, + "step": 171, + "total_loss": 0.007568359375 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997559964888434, + "lm_loss": 0.0240478515625, + "loss": 0.0146, + "step": 172, + "total_loss": 0.0240478515625 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997531511079672, + "lm_loss": 0.01220703125, + "loss": 0.012, + "step": 173, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997502892348518, + "lm_loss": 0.00372314453125, + "loss": 0.012, + "step": 174, + "total_loss": 0.00372314453125 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997474108695448, + "lm_loss": 0.0089111328125, + "loss": 0.0096, + "step": 175, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997445160120937, + "lm_loss": 0.004119873046875, + "loss": 0.0093, + "step": 176, + "total_loss": 0.004119873046875 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997416046625458, + "lm_loss": 0.00811767578125, + "loss": 0.0111, + "step": 177, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997386768209492, + "lm_loss": 0.016845703125, + "loss": 0.014, + "step": 178, + "total_loss": 0.016845703125 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997357324873526, + "lm_loss": 0.006378173828125, + "loss": 0.011, + "step": 179, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997327716618038, + "lm_loss": 0.0145263671875, + "loss": 0.0136, + "step": 180, + "total_loss": 0.0145263671875 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997297943443525, + "lm_loss": 0.015625, + "loss": 0.0134, + "step": 181, + "total_loss": 0.015625 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997268005350472, + "lm_loss": 0.01202392578125, + "loss": 0.013, + "step": 182, + "total_loss": 0.01202392578125 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019997237902339378, + "lm_loss": 0.006744384765625, + "loss": 0.0117, + "step": 183, + "total_loss": 0.006744384765625 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019997207634410736, + "lm_loss": 0.01177978515625, + "loss": 0.0099, + "step": 184, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019997177201565048, + "lm_loss": 0.01171875, + "loss": 0.0136, + "step": 185, + "total_loss": 0.01171875 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019997146603802812, + "lm_loss": 0.01226806640625, + "loss": 0.0118, + "step": 186, + "total_loss": 0.01226806640625 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019997115841124537, + "lm_loss": 0.01226806640625, + "loss": 0.0118, + "step": 187, + "total_loss": 0.01226806640625 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019997084913530726, + "lm_loss": 0.00970458984375, + "loss": 0.0124, + "step": 188, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019997053821021893, + "lm_loss": 0.0091552734375, + "loss": 0.011, + "step": 189, + "total_loss": 0.0091552734375 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001999702256359855, + "lm_loss": 0.00665283203125, + "loss": 0.01, + "step": 190, + "total_loss": 0.00665283203125 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001999699114126121, + "lm_loss": 0.007598876953125, + "loss": 0.0115, + "step": 191, + "total_loss": 0.007598876953125 + }, + { + "epoch": 0.08, + "learning_rate": 0.000199969595540104, + "lm_loss": 0.006805419921875, + "loss": 0.0117, + "step": 192, + "total_loss": 0.006805419921875 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001999692780184663, + "lm_loss": 0.005584716796875, + "loss": 0.0108, + "step": 193, + "total_loss": 0.005584716796875 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019996895884770427, + "lm_loss": 0.01019287109375, + "loss": 0.0105, + "step": 194, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019996863802782322, + "lm_loss": 0.01202392578125, + "loss": 0.0092, + "step": 195, + "total_loss": 0.01202392578125 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001999683155588284, + "lm_loss": 0.01019287109375, + "loss": 0.0126, + "step": 196, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019996799144072515, + "lm_loss": 0.0031890869140625, + "loss": 0.0087, + "step": 197, + "total_loss": 0.0031890869140625 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001999676656735188, + "lm_loss": 0.0174560546875, + "loss": 0.0104, + "step": 198, + "total_loss": 0.0174560546875 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019996733825721475, + "lm_loss": 0.0152587890625, + "loss": 0.0099, + "step": 199, + "total_loss": 0.0152587890625 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001999670091918184, + "lm_loss": 0.0186767578125, + "loss": 0.0124, + "step": 200, + "total_loss": 0.0186767578125 + }, + { + "epoch": 0.08, + "eval_lm_loss": 0.013095361180603504, + "eval_loss": 0.013629074208438396, + "eval_runtime": 43.92, + "eval_samples_per_second": 22.769, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.013095361180603504, + "lm_loss": 0.0010986328125, + "step": 200, + "total_loss": 0.0010986328125 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019996667847733512, + "lm_loss": 0.005615234375, + "loss": 0.0107, + "step": 201, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019996634611377042, + "lm_loss": 0.01007080078125, + "loss": 0.0107, + "step": 202, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019996601210112974, + "lm_loss": 0.00860595703125, + "loss": 0.0105, + "step": 203, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001999656764394187, + "lm_loss": 0.00445556640625, + "loss": 0.0114, + "step": 204, + "total_loss": 0.00445556640625 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019996533912864268, + "lm_loss": 0.01007080078125, + "loss": 0.0124, + "step": 205, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019996500016880738, + "lm_loss": 0.0155029296875, + "loss": 0.0114, + "step": 206, + "total_loss": 0.0155029296875 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019996465955991828, + "lm_loss": 0.0079345703125, + "loss": 0.0096, + "step": 207, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001999643173019811, + "lm_loss": 0.026123046875, + "loss": 0.0127, + "step": 208, + "total_loss": 0.026123046875 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019996397339500143, + "lm_loss": 0.007415771484375, + "loss": 0.0089, + "step": 209, + "total_loss": 0.007415771484375 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019996362783898493, + "lm_loss": 0.01416015625, + "loss": 0.0129, + "step": 210, + "total_loss": 0.01416015625 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001999632806339373, + "lm_loss": 0.01531982421875, + "loss": 0.0101, + "step": 211, + "total_loss": 0.01531982421875 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019996293177986435, + "lm_loss": 0.012939453125, + "loss": 0.0077, + "step": 212, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019996258127677173, + "lm_loss": 0.0106201171875, + "loss": 0.0104, + "step": 213, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019996222912466532, + "lm_loss": 0.0087890625, + "loss": 0.0099, + "step": 214, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019996187532355083, + "lm_loss": 0.01336669921875, + "loss": 0.0108, + "step": 215, + "total_loss": 0.01336669921875 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019996151987343414, + "lm_loss": 0.010986328125, + "loss": 0.0105, + "step": 216, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019996116277432114, + "lm_loss": 0.0101318359375, + "loss": 0.0121, + "step": 217, + "total_loss": 0.0101318359375 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019996080402621765, + "lm_loss": 0.01104736328125, + "loss": 0.0098, + "step": 218, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001999604436291297, + "lm_loss": 0.00860595703125, + "loss": 0.0114, + "step": 219, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019996008158306312, + "lm_loss": 0.01031494140625, + "loss": 0.0117, + "step": 220, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995971788802397, + "lm_loss": 0.01025390625, + "loss": 0.012, + "step": 221, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001999593525440182, + "lm_loss": 0.00811767578125, + "loss": 0.0128, + "step": 222, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995898555105185, + "lm_loss": 0.01397705078125, + "loss": 0.0104, + "step": 223, + "total_loss": 0.01397705078125 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995861690913093, + "lm_loss": 0.0140380859375, + "loss": 0.013, + "step": 224, + "total_loss": 0.0140380859375 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995824661826162, + "lm_loss": 0.0191650390625, + "loss": 0.0129, + "step": 225, + "total_loss": 0.0191650390625 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995787467844996, + "lm_loss": 0.0159912109375, + "loss": 0.011, + "step": 226, + "total_loss": 0.0159912109375 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995750108970208, + "lm_loss": 0.01092529296875, + "loss": 0.0112, + "step": 227, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995712585202418, + "lm_loss": 0.009521484375, + "loss": 0.0102, + "step": 228, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995674896542242, + "lm_loss": 0.006134033203125, + "loss": 0.0102, + "step": 229, + "total_loss": 0.006134033203125 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995637042990303, + "lm_loss": 0.00872802734375, + "loss": 0.0124, + "step": 230, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995599024547224, + "lm_loss": 0.00909423828125, + "loss": 0.0113, + "step": 231, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019995560841213635, + "lm_loss": 0.0072021484375, + "loss": 0.01, + "step": 232, + "total_loss": 0.0072021484375 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995522492990167, + "lm_loss": 0.00439453125, + "loss": 0.0118, + "step": 233, + "total_loss": 0.00439453125 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995483979877446, + "lm_loss": 0.007476806640625, + "loss": 0.0109, + "step": 234, + "total_loss": 0.007476806640625 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001999544530187611, + "lm_loss": 0.0067138671875, + "loss": 0.0128, + "step": 235, + "total_loss": 0.0067138671875 + }, + { + "epoch": 0.1, + "learning_rate": 0.000199954064589868, + "lm_loss": 0.010986328125, + "loss": 0.0113, + "step": 236, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995367451210156, + "lm_loss": 0.00518798828125, + "loss": 0.0113, + "step": 237, + "total_loss": 0.00518798828125 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995328278546822, + "lm_loss": 0.006011962890625, + "loss": 0.0111, + "step": 238, + "total_loss": 0.006011962890625 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995288940997442, + "lm_loss": 0.01171875, + "loss": 0.0096, + "step": 239, + "total_loss": 0.01171875 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995249438562663, + "lm_loss": 0.0177001953125, + "loss": 0.0118, + "step": 240, + "total_loss": 0.0177001953125 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001999520977124314, + "lm_loss": 0.01287841796875, + "loss": 0.0106, + "step": 241, + "total_loss": 0.01287841796875 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995169939039529, + "lm_loss": 0.012939453125, + "loss": 0.0092, + "step": 242, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995129941952482, + "lm_loss": 0.01007080078125, + "loss": 0.0105, + "step": 243, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995089779982665, + "lm_loss": 0.01708984375, + "loss": 0.0148, + "step": 244, + "total_loss": 0.01708984375 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995049453130735, + "lm_loss": 0.01043701171875, + "loss": 0.0142, + "step": 245, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019995008961397356, + "lm_loss": 0.01275634765625, + "loss": 0.0102, + "step": 246, + "total_loss": 0.01275634765625 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019994968304783205, + "lm_loss": 0.015625, + "loss": 0.0114, + "step": 247, + "total_loss": 0.015625 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019994927483288946, + "lm_loss": 0.011474609375, + "loss": 0.0109, + "step": 248, + "total_loss": 0.011474609375 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019994886496915251, + "lm_loss": 0.006011962890625, + "loss": 0.0113, + "step": 249, + "total_loss": 0.006011962890625 + }, + { + "epoch": 0.1, + "learning_rate": 0.000199948453456628, + "lm_loss": 0.007720947265625, + "loss": 0.0118, + "step": 250, + "total_loss": 0.007720947265625 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001999480402953227, + "lm_loss": 0.01251220703125, + "loss": 0.0109, + "step": 251, + "total_loss": 0.01251220703125 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019994762548524342, + "lm_loss": 0.0244140625, + "loss": 0.0129, + "step": 252, + "total_loss": 0.0244140625 + }, + { + "epoch": 0.1, + "learning_rate": 0.000199947209026397, + "lm_loss": 0.02001953125, + "loss": 0.0118, + "step": 253, + "total_loss": 0.02001953125 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019994679091879037, + "lm_loss": 0.01080322265625, + "loss": 0.0111, + "step": 254, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019994637116243033, + "lm_loss": 0.0059814453125, + "loss": 0.0107, + "step": 255, + "total_loss": 0.0059814453125 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019994594975732388, + "lm_loss": 0.01300048828125, + "loss": 0.0144, + "step": 256, + "total_loss": 0.01300048828125 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994552670347794, + "lm_loss": 0.0096435546875, + "loss": 0.0112, + "step": 257, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994510200089952, + "lm_loss": 0.006072998046875, + "loss": 0.0079, + "step": 258, + "total_loss": 0.006072998046875 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994467564959557, + "lm_loss": 0.01275634765625, + "loss": 0.0094, + "step": 259, + "total_loss": 0.01275634765625 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994424764957315, + "lm_loss": 0.0079345703125, + "loss": 0.0108, + "step": 260, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994381800083933, + "lm_loss": 0.00933837890625, + "loss": 0.0108, + "step": 261, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994338670340121, + "lm_loss": 0.00579833984375, + "loss": 0.0111, + "step": 262, + "total_loss": 0.00579833984375 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994295375726586, + "lm_loss": 0.00592041015625, + "loss": 0.0138, + "step": 263, + "total_loss": 0.00592041015625 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994251916244043, + "lm_loss": 0.0115966796875, + "loss": 0.0091, + "step": 264, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994208291893216, + "lm_loss": 0.01043701171875, + "loss": 0.0126, + "step": 265, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994164502674816, + "lm_loss": 0.007080078125, + "loss": 0.0123, + "step": 266, + "total_loss": 0.007080078125 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001999412054858957, + "lm_loss": 0.0159912109375, + "loss": 0.0139, + "step": 267, + "total_loss": 0.0159912109375 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994076429638203, + "lm_loss": 0.01556396484375, + "loss": 0.0138, + "step": 268, + "total_loss": 0.01556396484375 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001999403214582144, + "lm_loss": 0.01263427734375, + "loss": 0.0106, + "step": 269, + "total_loss": 0.01263427734375 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019993987697140012, + "lm_loss": 0.01513671875, + "loss": 0.0109, + "step": 270, + "total_loss": 0.01513671875 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019993943083594656, + "lm_loss": 0.00872802734375, + "loss": 0.0099, + "step": 271, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019993898305186103, + "lm_loss": 0.0089111328125, + "loss": 0.0129, + "step": 272, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019993853361915096, + "lm_loss": 0.0167236328125, + "loss": 0.0125, + "step": 273, + "total_loss": 0.0167236328125 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019993808253782373, + "lm_loss": 0.0074462890625, + "loss": 0.011, + "step": 274, + "total_loss": 0.0074462890625 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019993762980788683, + "lm_loss": 0.00927734375, + "loss": 0.01, + "step": 275, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001999371754293476, + "lm_loss": 0.0084228515625, + "loss": 0.0085, + "step": 276, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019993671940221375, + "lm_loss": 0.0167236328125, + "loss": 0.0117, + "step": 277, + "total_loss": 0.0167236328125 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001999362617264926, + "lm_loss": 0.00872802734375, + "loss": 0.0106, + "step": 278, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019993580240219183, + "lm_loss": 0.01507568359375, + "loss": 0.0118, + "step": 279, + "total_loss": 0.01507568359375 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019993534142931894, + "lm_loss": 0.013671875, + "loss": 0.0119, + "step": 280, + "total_loss": 0.013671875 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019993487880788156, + "lm_loss": 0.0087890625, + "loss": 0.0095, + "step": 281, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019993441453788736, + "lm_loss": 0.0086669921875, + "loss": 0.0101, + "step": 282, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019993394861934392, + "lm_loss": 0.01123046875, + "loss": 0.0122, + "step": 283, + "total_loss": 0.01123046875 + }, + { + "epoch": 0.12, + "learning_rate": 0.000199933481052259, + "lm_loss": 0.00689697265625, + "loss": 0.0117, + "step": 284, + "total_loss": 0.00689697265625 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019993301183664027, + "lm_loss": 0.01220703125, + "loss": 0.0097, + "step": 285, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019993254097249547, + "lm_loss": 0.010986328125, + "loss": 0.0132, + "step": 286, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019993206845983244, + "lm_loss": 0.005767822265625, + "loss": 0.0101, + "step": 287, + "total_loss": 0.005767822265625 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019993159429865888, + "lm_loss": 0.01116943359375, + "loss": 0.0113, + "step": 288, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019993111848898263, + "lm_loss": 0.01007080078125, + "loss": 0.0098, + "step": 289, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019993064103081154, + "lm_loss": 0.012939453125, + "loss": 0.0136, + "step": 290, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019993016192415355, + "lm_loss": 0.00811767578125, + "loss": 0.0115, + "step": 291, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019992968116901649, + "lm_loss": 0.0103759765625, + "loss": 0.0095, + "step": 292, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019992919876540832, + "lm_loss": 0.00921630859375, + "loss": 0.0086, + "step": 293, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.12, + "learning_rate": 0.000199928714713337, + "lm_loss": 0.006500244140625, + "loss": 0.0098, + "step": 294, + "total_loss": 0.006500244140625 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019992822901281052, + "lm_loss": 0.0113525390625, + "loss": 0.0115, + "step": 295, + "total_loss": 0.0113525390625 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019992774166383684, + "lm_loss": 0.01177978515625, + "loss": 0.0091, + "step": 296, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001999272526664241, + "lm_loss": 0.01190185546875, + "loss": 0.0107, + "step": 297, + "total_loss": 0.01190185546875 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001999267620205803, + "lm_loss": 0.002960205078125, + "loss": 0.0096, + "step": 298, + "total_loss": 0.002960205078125 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001999262697263135, + "lm_loss": 0.006103515625, + "loss": 0.0114, + "step": 299, + "total_loss": 0.006103515625 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019992577578363188, + "lm_loss": 0.00579833984375, + "loss": 0.0099, + "step": 300, + "total_loss": 0.00579833984375 + }, + { + "epoch": 0.12, + "eval_lm_loss": 0.012582213617861271, + "eval_loss": 0.013147125020623207, + "eval_runtime": 44.0148, + "eval_samples_per_second": 22.72, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.012582213617861271, + "lm_loss": 0.0034637451171875, + "step": 300, + "total_loss": 0.0034637451171875 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019992528019254359, + "lm_loss": 0.01202392578125, + "loss": 0.0096, + "step": 301, + "total_loss": 0.01202392578125 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019992478295305678, + "lm_loss": 0.01104736328125, + "loss": 0.0106, + "step": 302, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019992428406517964, + "lm_loss": 0.015625, + "loss": 0.0097, + "step": 303, + "total_loss": 0.015625 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019992378352892045, + "lm_loss": 0.006256103515625, + "loss": 0.0088, + "step": 304, + "total_loss": 0.006256103515625 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019992328134428743, + "lm_loss": 0.007659912109375, + "loss": 0.0111, + "step": 305, + "total_loss": 0.007659912109375 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019992277751128886, + "lm_loss": 0.007720947265625, + "loss": 0.0082, + "step": 306, + "total_loss": 0.007720947265625 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019992227202993306, + "lm_loss": 0.010986328125, + "loss": 0.0118, + "step": 307, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019992176490022835, + "lm_loss": 0.00616455078125, + "loss": 0.0125, + "step": 308, + "total_loss": 0.00616455078125 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019992125612218315, + "lm_loss": 0.0230712890625, + "loss": 0.0107, + "step": 309, + "total_loss": 0.0230712890625 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001999207456958058, + "lm_loss": 0.01141357421875, + "loss": 0.0093, + "step": 310, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019992023362110474, + "lm_loss": 0.0146484375, + "loss": 0.011, + "step": 311, + "total_loss": 0.0146484375 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991971989808844, + "lm_loss": 0.008544921875, + "loss": 0.0102, + "step": 312, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991920452676535, + "lm_loss": 0.01092529296875, + "loss": 0.0126, + "step": 313, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991868750714395, + "lm_loss": 0.0074462890625, + "loss": 0.011, + "step": 314, + "total_loss": 0.0074462890625 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991816883923282, + "lm_loss": 0.010986328125, + "loss": 0.0093, + "step": 315, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991764852304048, + "lm_loss": 0.0115966796875, + "loss": 0.0091, + "step": 316, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991712655857547, + "lm_loss": 0.0087890625, + "loss": 0.0112, + "step": 317, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001999166029458465, + "lm_loss": 0.00567626953125, + "loss": 0.0118, + "step": 318, + "total_loss": 0.00567626953125 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001999160776848622, + "lm_loss": 0.00982666015625, + "loss": 0.0096, + "step": 319, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001999155507756311, + "lm_loss": 0.016845703125, + "loss": 0.0093, + "step": 320, + "total_loss": 0.016845703125 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991502221816204, + "lm_loss": 0.005706787109375, + "loss": 0.0096, + "step": 321, + "total_loss": 0.005706787109375 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991449201246369, + "lm_loss": 0.01300048828125, + "loss": 0.0105, + "step": 322, + "total_loss": 0.01300048828125 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001999139601585448, + "lm_loss": 0.01116943359375, + "loss": 0.0117, + "step": 323, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991342665641408, + "lm_loss": 0.0048828125, + "loss": 0.0082, + "step": 324, + "total_loss": 0.0048828125 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991289150608044, + "lm_loss": 0.0089111328125, + "loss": 0.0125, + "step": 325, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991235470755262, + "lm_loss": 0.0106201171875, + "loss": 0.01, + "step": 326, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991181626083954, + "lm_loss": 0.006927490234375, + "loss": 0.0099, + "step": 327, + "total_loss": 0.006927490234375 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991127616595004, + "lm_loss": 0.005157470703125, + "loss": 0.0116, + "step": 328, + "total_loss": 0.005157470703125 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991073442289303, + "lm_loss": 0.01165771484375, + "loss": 0.0098, + "step": 329, + "total_loss": 0.01165771484375 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019991019103167748, + "lm_loss": 0.01422119140625, + "loss": 0.0108, + "step": 330, + "total_loss": 0.01422119140625 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001999096459923123, + "lm_loss": 0.0086669921875, + "loss": 0.0098, + "step": 331, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990909930480658, + "lm_loss": 0.01361083984375, + "loss": 0.0093, + "step": 332, + "total_loss": 0.01361083984375 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001999085509691692, + "lm_loss": 0.003814697265625, + "loss": 0.0104, + "step": 333, + "total_loss": 0.003814697265625 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990800098540932, + "lm_loss": 0.01531982421875, + "loss": 0.0093, + "step": 334, + "total_loss": 0.01531982421875 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990744935353597, + "lm_loss": 0.0057373046875, + "loss": 0.0089, + "step": 335, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001999068960735582, + "lm_loss": 0.01300048828125, + "loss": 0.0093, + "step": 336, + "total_loss": 0.01300048828125 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990634114548525, + "lm_loss": 0.0211181640625, + "loss": 0.0109, + "step": 337, + "total_loss": 0.0211181640625 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990578456932618, + "lm_loss": 0.004241943359375, + "loss": 0.0108, + "step": 338, + "total_loss": 0.004241943359375 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001999052263450902, + "lm_loss": 0.0113525390625, + "loss": 0.0101, + "step": 339, + "total_loss": 0.0113525390625 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001999046664727865, + "lm_loss": 0.01483154296875, + "loss": 0.0101, + "step": 340, + "total_loss": 0.01483154296875 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001999041049524244, + "lm_loss": 0.0125732421875, + "loss": 0.0105, + "step": 341, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990354178401308, + "lm_loss": 0.008056640625, + "loss": 0.0086, + "step": 342, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990297696756182, + "lm_loss": 0.00933837890625, + "loss": 0.0109, + "step": 343, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990241050307997, + "lm_loss": 0.0047607421875, + "loss": 0.0127, + "step": 344, + "total_loss": 0.0047607421875 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990184239057689, + "lm_loss": 0.01068115234375, + "loss": 0.0102, + "step": 345, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990127263006194, + "lm_loss": 0.01470947265625, + "loss": 0.0111, + "step": 346, + "total_loss": 0.01470947265625 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001999007012215445, + "lm_loss": 0.01129150390625, + "loss": 0.0119, + "step": 347, + "total_loss": 0.01129150390625 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019990012816503404, + "lm_loss": 0.0087890625, + "loss": 0.0102, + "step": 348, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019989955346053995, + "lm_loss": 0.01373291015625, + "loss": 0.01, + "step": 349, + "total_loss": 0.01373291015625 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019989897710807176, + "lm_loss": 0.01397705078125, + "loss": 0.0098, + "step": 350, + "total_loss": 0.01397705078125 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019989839910763892, + "lm_loss": 0.0169677734375, + "loss": 0.0115, + "step": 351, + "total_loss": 0.0169677734375 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019989781945925104, + "lm_loss": 0.007354736328125, + "loss": 0.0113, + "step": 352, + "total_loss": 0.007354736328125 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019989723816291765, + "lm_loss": 0.006591796875, + "loss": 0.0078, + "step": 353, + "total_loss": 0.006591796875 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001998966552186483, + "lm_loss": 0.01287841796875, + "loss": 0.0121, + "step": 354, + "total_loss": 0.01287841796875 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001998960706264527, + "lm_loss": 0.007537841796875, + "loss": 0.0102, + "step": 355, + "total_loss": 0.007537841796875 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001998954843863404, + "lm_loss": 0.0191650390625, + "loss": 0.0118, + "step": 356, + "total_loss": 0.0191650390625 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001998948964983211, + "lm_loss": 0.00970458984375, + "loss": 0.01, + "step": 357, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019989430696240454, + "lm_loss": 0.0078125, + "loss": 0.0107, + "step": 358, + "total_loss": 0.0078125 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001998937157786004, + "lm_loss": 0.00872802734375, + "loss": 0.011, + "step": 359, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019989312294691844, + "lm_loss": 0.00897216796875, + "loss": 0.0112, + "step": 360, + "total_loss": 0.00897216796875 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019989252846736846, + "lm_loss": 0.005218505859375, + "loss": 0.0099, + "step": 361, + "total_loss": 0.005218505859375 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019989193233996023, + "lm_loss": 0.0103759765625, + "loss": 0.009, + "step": 362, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019989133456470358, + "lm_loss": 0.01092529296875, + "loss": 0.0109, + "step": 363, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019989073514160843, + "lm_loss": 0.01031494140625, + "loss": 0.0097, + "step": 364, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019989013407068463, + "lm_loss": 0.0086669921875, + "loss": 0.0121, + "step": 365, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019988953135194207, + "lm_loss": 0.00396728515625, + "loss": 0.0097, + "step": 366, + "total_loss": 0.00396728515625 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019988892698539077, + "lm_loss": 0.01348876953125, + "loss": 0.012, + "step": 367, + "total_loss": 0.01348876953125 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019988832097104063, + "lm_loss": 0.006378173828125, + "loss": 0.0116, + "step": 368, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019988771330890166, + "lm_loss": 0.0040283203125, + "loss": 0.0078, + "step": 369, + "total_loss": 0.0040283203125 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001998871039989839, + "lm_loss": 0.004486083984375, + "loss": 0.0091, + "step": 370, + "total_loss": 0.004486083984375 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019988649304129735, + "lm_loss": 0.00494384765625, + "loss": 0.0098, + "step": 371, + "total_loss": 0.00494384765625 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001998858804358522, + "lm_loss": 0.0126953125, + "loss": 0.0106, + "step": 372, + "total_loss": 0.0126953125 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019988526618265845, + "lm_loss": 0.0128173828125, + "loss": 0.0098, + "step": 373, + "total_loss": 0.0128173828125 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019988465028172625, + "lm_loss": 0.012939453125, + "loss": 0.0114, + "step": 374, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019988403273306582, + "lm_loss": 0.0169677734375, + "loss": 0.0125, + "step": 375, + "total_loss": 0.0169677734375 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001998834135366873, + "lm_loss": 0.010498046875, + "loss": 0.0098, + "step": 376, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019988279269260087, + "lm_loss": 0.00946044921875, + "loss": 0.0111, + "step": 377, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019988217020081687, + "lm_loss": 0.00958251953125, + "loss": 0.0097, + "step": 378, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001998815460613455, + "lm_loss": 0.0167236328125, + "loss": 0.0129, + "step": 379, + "total_loss": 0.0167236328125 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019988092027419706, + "lm_loss": 0.0089111328125, + "loss": 0.012, + "step": 380, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019988029283938188, + "lm_loss": 0.010986328125, + "loss": 0.0111, + "step": 381, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987966375691033, + "lm_loss": 0.00811767578125, + "loss": 0.0096, + "step": 382, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987903302679273, + "lm_loss": 0.007049560546875, + "loss": 0.012, + "step": 383, + "total_loss": 0.007049560546875 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987840064903957, + "lm_loss": 0.00653076171875, + "loss": 0.009, + "step": 384, + "total_loss": 0.00653076171875 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987776662366122, + "lm_loss": 0.01165771484375, + "loss": 0.0095, + "step": 385, + "total_loss": 0.01165771484375 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987713095066814, + "lm_loss": 0.0050048828125, + "loss": 0.0078, + "step": 386, + "total_loss": 0.0050048828125 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987649363007086, + "lm_loss": 0.01318359375, + "loss": 0.0098, + "step": 387, + "total_loss": 0.01318359375 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987585466187983, + "lm_loss": 0.0098876953125, + "loss": 0.0115, + "step": 388, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987521404610565, + "lm_loss": 0.01446533203125, + "loss": 0.0105, + "step": 389, + "total_loss": 0.01446533203125 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987457178275887, + "lm_loss": 0.006378173828125, + "loss": 0.0122, + "step": 390, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987392787185006, + "lm_loss": 0.01361083984375, + "loss": 0.0125, + "step": 391, + "total_loss": 0.01361083984375 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987328231338988, + "lm_loss": 0.004150390625, + "loss": 0.0108, + "step": 392, + "total_loss": 0.004150390625 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987263510738893, + "lm_loss": 0.00701904296875, + "loss": 0.0091, + "step": 393, + "total_loss": 0.00701904296875 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987198625385794, + "lm_loss": 0.01104736328125, + "loss": 0.0114, + "step": 394, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001998713357528076, + "lm_loss": 0.004547119140625, + "loss": 0.0086, + "step": 395, + "total_loss": 0.004547119140625 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987068360424862, + "lm_loss": 0.0159912109375, + "loss": 0.0132, + "step": 396, + "total_loss": 0.0159912109375 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019987002980819177, + "lm_loss": 0.01409912109375, + "loss": 0.0097, + "step": 397, + "total_loss": 0.01409912109375 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019986937436464782, + "lm_loss": 0.01611328125, + "loss": 0.0117, + "step": 398, + "total_loss": 0.01611328125 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019986871727362758, + "lm_loss": 0.007415771484375, + "loss": 0.009, + "step": 399, + "total_loss": 0.007415771484375 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019986805853514198, + "lm_loss": 0.004180908203125, + "loss": 0.0089, + "step": 400, + "total_loss": 0.004180908203125 + }, + { + "epoch": 0.16, + "eval_lm_loss": 0.01214557234197855, + "eval_loss": 0.012592697516083717, + "eval_runtime": 44.0487, + "eval_samples_per_second": 22.702, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.01214557234197855, + "lm_loss": 0.0012359619140625, + "step": 400, + "total_loss": 0.0012359619140625 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019986739814920173, + "lm_loss": 0.01226806640625, + "loss": 0.0096, + "step": 401, + "total_loss": 0.01226806640625 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019986673611581786, + "lm_loss": 0.0107421875, + "loss": 0.0093, + "step": 402, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001998660724350012, + "lm_loss": 0.01068115234375, + "loss": 0.0117, + "step": 403, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019986540710676277, + "lm_loss": 0.0159912109375, + "loss": 0.0118, + "step": 404, + "total_loss": 0.0159912109375 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019986474013111352, + "lm_loss": 0.00909423828125, + "loss": 0.0104, + "step": 405, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001998640715080644, + "lm_loss": 0.01361083984375, + "loss": 0.0078, + "step": 406, + "total_loss": 0.01361083984375 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019986340123762653, + "lm_loss": 0.01220703125, + "loss": 0.0104, + "step": 407, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001998627293198109, + "lm_loss": 0.0032806396484375, + "loss": 0.0098, + "step": 408, + "total_loss": 0.0032806396484375 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019986205575462862, + "lm_loss": 0.01422119140625, + "loss": 0.0119, + "step": 409, + "total_loss": 0.01422119140625 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001998613805420908, + "lm_loss": 0.01300048828125, + "loss": 0.0118, + "step": 410, + "total_loss": 0.01300048828125 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019986070368220856, + "lm_loss": 0.01092529296875, + "loss": 0.0117, + "step": 411, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019986002517499309, + "lm_loss": 0.00921630859375, + "loss": 0.0124, + "step": 412, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985934502045558, + "lm_loss": 0.00921630859375, + "loss": 0.0093, + "step": 413, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985866321860725, + "lm_loss": 0.01019287109375, + "loss": 0.01, + "step": 414, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001998579797694593, + "lm_loss": 0.0179443359375, + "loss": 0.0092, + "step": 415, + "total_loss": 0.0179443359375 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001998572946730231, + "lm_loss": 0.00787353515625, + "loss": 0.0105, + "step": 416, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985660792930986, + "lm_loss": 0.009765625, + "loss": 0.0115, + "step": 417, + "total_loss": 0.009765625 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985591953833095, + "lm_loss": 0.0137939453125, + "loss": 0.0115, + "step": 418, + "total_loss": 0.0137939453125 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985522950009775, + "lm_loss": 0.005828857421875, + "loss": 0.0082, + "step": 419, + "total_loss": 0.005828857421875 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985453781462157, + "lm_loss": 0.00653076171875, + "loss": 0.0093, + "step": 420, + "total_loss": 0.00653076171875 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985384448191392, + "lm_loss": 0.006622314453125, + "loss": 0.0104, + "step": 421, + "total_loss": 0.006622314453125 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985314950198615, + "lm_loss": 0.006866455078125, + "loss": 0.0105, + "step": 422, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985245287484977, + "lm_loss": 0.0150146484375, + "loss": 0.0145, + "step": 423, + "total_loss": 0.0150146484375 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985175460051624, + "lm_loss": 0.020263671875, + "loss": 0.0102, + "step": 424, + "total_loss": 0.020263671875 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001998510546789971, + "lm_loss": 0.004241943359375, + "loss": 0.0129, + "step": 425, + "total_loss": 0.004241943359375 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019985035311030391, + "lm_loss": 0.00555419921875, + "loss": 0.0091, + "step": 426, + "total_loss": 0.00555419921875 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001998496498944482, + "lm_loss": 0.0093994140625, + "loss": 0.0111, + "step": 427, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001998489450314416, + "lm_loss": 0.00982666015625, + "loss": 0.0114, + "step": 428, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019984823852129576, + "lm_loss": 0.0230712890625, + "loss": 0.0115, + "step": 429, + "total_loss": 0.0230712890625 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019984753036402233, + "lm_loss": 0.0123291015625, + "loss": 0.0094, + "step": 430, + "total_loss": 0.0123291015625 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019984682055963293, + "lm_loss": 0.010009765625, + "loss": 0.0087, + "step": 431, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019984610910813935, + "lm_loss": 0.0108642578125, + "loss": 0.0085, + "step": 432, + "total_loss": 0.0108642578125 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019984539600955326, + "lm_loss": 0.0084228515625, + "loss": 0.0102, + "step": 433, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019984468126388647, + "lm_loss": 0.01165771484375, + "loss": 0.0105, + "step": 434, + "total_loss": 0.01165771484375 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001998439648711507, + "lm_loss": 0.011474609375, + "loss": 0.0083, + "step": 435, + "total_loss": 0.011474609375 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001998432468313579, + "lm_loss": 0.0096435546875, + "loss": 0.0106, + "step": 436, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019984252714451976, + "lm_loss": 0.01544189453125, + "loss": 0.0094, + "step": 437, + "total_loss": 0.01544189453125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001998418058106483, + "lm_loss": 0.005035400390625, + "loss": 0.0081, + "step": 438, + "total_loss": 0.005035400390625 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019984108282975526, + "lm_loss": 0.01007080078125, + "loss": 0.0127, + "step": 439, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001998403582018527, + "lm_loss": 0.01220703125, + "loss": 0.0098, + "step": 440, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983963192695255, + "lm_loss": 0.00946044921875, + "loss": 0.0119, + "step": 441, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983890400506672, + "lm_loss": 0.021240234375, + "loss": 0.0115, + "step": 442, + "total_loss": 0.021240234375 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983817443620732, + "lm_loss": 0.01556396484375, + "loss": 0.0115, + "step": 443, + "total_loss": 0.01556396484375 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983744322038627, + "lm_loss": 0.01153564453125, + "loss": 0.0097, + "step": 444, + "total_loss": 0.01153564453125 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983671035761572, + "lm_loss": 0.0089111328125, + "loss": 0.01, + "step": 445, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983597584790776, + "lm_loss": 0.007476806640625, + "loss": 0.0088, + "step": 446, + "total_loss": 0.007476806640625 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983523969127444, + "lm_loss": 0.01007080078125, + "loss": 0.0111, + "step": 447, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983450188772794, + "lm_loss": 0.01080322265625, + "loss": 0.0098, + "step": 448, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983376243728047, + "lm_loss": 0.0079345703125, + "loss": 0.0113, + "step": 449, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983302133994417, + "lm_loss": 0.0091552734375, + "loss": 0.0113, + "step": 450, + "total_loss": 0.0091552734375 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001998322785957313, + "lm_loss": 0.0059814453125, + "loss": 0.0088, + "step": 451, + "total_loss": 0.0059814453125 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019983153420465407, + "lm_loss": 0.01123046875, + "loss": 0.0102, + "step": 452, + "total_loss": 0.01123046875 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019983078816672483, + "lm_loss": 0.01226806640625, + "loss": 0.0105, + "step": 453, + "total_loss": 0.01226806640625 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019983004048195583, + "lm_loss": 0.005645751953125, + "loss": 0.0123, + "step": 454, + "total_loss": 0.005645751953125 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001998292911503594, + "lm_loss": 0.0194091796875, + "loss": 0.0108, + "step": 455, + "total_loss": 0.0194091796875 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019982854017194795, + "lm_loss": 0.0242919921875, + "loss": 0.0132, + "step": 456, + "total_loss": 0.0242919921875 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019982778754673382, + "lm_loss": 0.0216064453125, + "loss": 0.0095, + "step": 457, + "total_loss": 0.0216064453125 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019982703327472946, + "lm_loss": 0.01434326171875, + "loss": 0.012, + "step": 458, + "total_loss": 0.01434326171875 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001998262773559473, + "lm_loss": 0.006134033203125, + "loss": 0.0097, + "step": 459, + "total_loss": 0.006134033203125 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019982551979039981, + "lm_loss": 0.0113525390625, + "loss": 0.013, + "step": 460, + "total_loss": 0.0113525390625 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001998247605780995, + "lm_loss": 0.01239013671875, + "loss": 0.0094, + "step": 461, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019982399971905883, + "lm_loss": 0.0076904296875, + "loss": 0.0092, + "step": 462, + "total_loss": 0.0076904296875 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019982323721329045, + "lm_loss": 0.0107421875, + "loss": 0.009, + "step": 463, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019982247306080686, + "lm_loss": 0.005462646484375, + "loss": 0.0089, + "step": 464, + "total_loss": 0.005462646484375 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001998217072616207, + "lm_loss": 0.01513671875, + "loss": 0.0129, + "step": 465, + "total_loss": 0.01513671875 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019982093981574462, + "lm_loss": 0.0079345703125, + "loss": 0.0116, + "step": 466, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019982017072319125, + "lm_loss": 0.009521484375, + "loss": 0.012, + "step": 467, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019981939998397325, + "lm_loss": 0.019287109375, + "loss": 0.0111, + "step": 468, + "total_loss": 0.019287109375 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001998186275981034, + "lm_loss": 0.006317138671875, + "loss": 0.0096, + "step": 469, + "total_loss": 0.006317138671875 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019981785356559443, + "lm_loss": 0.00909423828125, + "loss": 0.0093, + "step": 470, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019981707788645906, + "lm_loss": 0.0078125, + "loss": 0.0085, + "step": 471, + "total_loss": 0.0078125 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001998163005607101, + "lm_loss": 0.01068115234375, + "loss": 0.0112, + "step": 472, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019981552158836044, + "lm_loss": 0.006500244140625, + "loss": 0.011, + "step": 473, + "total_loss": 0.006500244140625 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019981474096942284, + "lm_loss": 0.006988525390625, + "loss": 0.0097, + "step": 474, + "total_loss": 0.006988525390625 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019981395870391025, + "lm_loss": 0.01031494140625, + "loss": 0.011, + "step": 475, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001998131747918355, + "lm_loss": 0.005401611328125, + "loss": 0.0097, + "step": 476, + "total_loss": 0.005401611328125 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019981238923321158, + "lm_loss": 0.00921630859375, + "loss": 0.0106, + "step": 477, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019981160202805142, + "lm_loss": 0.00848388671875, + "loss": 0.0085, + "step": 478, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019981081317636804, + "lm_loss": 0.01226806640625, + "loss": 0.0128, + "step": 479, + "total_loss": 0.01226806640625 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019981002267817444, + "lm_loss": 0.00872802734375, + "loss": 0.0099, + "step": 480, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980923053348364, + "lm_loss": 0.00799560546875, + "loss": 0.0109, + "step": 481, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980843674230868, + "lm_loss": 0.0096435546875, + "loss": 0.0108, + "step": 482, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980764130466273, + "lm_loss": 0.01361083984375, + "loss": 0.0123, + "step": 483, + "total_loss": 0.01361083984375 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980684422055887, + "lm_loss": 0.007598876953125, + "loss": 0.0091, + "step": 484, + "total_loss": 0.007598876953125 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980604549001024, + "lm_loss": 0.007476806640625, + "loss": 0.0093, + "step": 485, + "total_loss": 0.007476806640625 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980524511303008, + "lm_loss": 0.0162353515625, + "loss": 0.0087, + "step": 486, + "total_loss": 0.0162353515625 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001998044430896315, + "lm_loss": 0.007354736328125, + "loss": 0.0107, + "step": 487, + "total_loss": 0.007354736328125 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980363941982777, + "lm_loss": 0.0064697265625, + "loss": 0.0092, + "step": 488, + "total_loss": 0.0064697265625 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980283410363215, + "lm_loss": 0.01055908203125, + "loss": 0.0102, + "step": 489, + "total_loss": 0.01055908203125 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980202714105794, + "lm_loss": 0.0084228515625, + "loss": 0.0081, + "step": 490, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980121853211842, + "lm_loss": 0.006591796875, + "loss": 0.0111, + "step": 491, + "total_loss": 0.006591796875 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019980040827682696, + "lm_loss": 0.00604248046875, + "loss": 0.0119, + "step": 492, + "total_loss": 0.00604248046875 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019979959637519688, + "lm_loss": 0.00994873046875, + "loss": 0.0089, + "step": 493, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019979878282724167, + "lm_loss": 0.012939453125, + "loss": 0.0098, + "step": 494, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019979796763297465, + "lm_loss": 0.0125732421875, + "loss": 0.01, + "step": 495, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019979715079240933, + "lm_loss": 0.0079345703125, + "loss": 0.0081, + "step": 496, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019979633230555912, + "lm_loss": 0.01092529296875, + "loss": 0.009, + "step": 497, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019979551217243758, + "lm_loss": 0.0172119140625, + "loss": 0.0103, + "step": 498, + "total_loss": 0.0172119140625 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019979469039305823, + "lm_loss": 0.01275634765625, + "loss": 0.0092, + "step": 499, + "total_loss": 0.01275634765625 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019979386696743459, + "lm_loss": 0.0120849609375, + "loss": 0.0086, + "step": 500, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.2, + "eval_lm_loss": 0.011539776809513569, + "eval_loss": 0.012045402079820633, + "eval_runtime": 44.0051, + "eval_samples_per_second": 22.725, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.011539776809513569, + "lm_loss": 0.00072479248046875, + "step": 500, + "total_loss": 0.00072479248046875 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001997930418955803, + "lm_loss": 0.00897216796875, + "loss": 0.0106, + "step": 501, + "total_loss": 0.00897216796875 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019979221517750894, + "lm_loss": 0.01422119140625, + "loss": 0.0122, + "step": 502, + "total_loss": 0.01422119140625 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019979138681323414, + "lm_loss": 0.00537109375, + "loss": 0.0088, + "step": 503, + "total_loss": 0.00537109375 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019979055680276954, + "lm_loss": 0.01287841796875, + "loss": 0.0108, + "step": 504, + "total_loss": 0.01287841796875 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019978972514612893, + "lm_loss": 0.01531982421875, + "loss": 0.0132, + "step": 505, + "total_loss": 0.01531982421875 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019978889184332592, + "lm_loss": 0.00750732421875, + "loss": 0.0116, + "step": 506, + "total_loss": 0.00750732421875 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001997880568943743, + "lm_loss": 0.006744384765625, + "loss": 0.0091, + "step": 507, + "total_loss": 0.006744384765625 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019978722029928786, + "lm_loss": 0.007293701171875, + "loss": 0.0102, + "step": 508, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001997863820580804, + "lm_loss": 0.00872802734375, + "loss": 0.0097, + "step": 509, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019978554217076573, + "lm_loss": 0.007110595703125, + "loss": 0.0088, + "step": 510, + "total_loss": 0.007110595703125 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019978470063735767, + "lm_loss": 0.00799560546875, + "loss": 0.0099, + "step": 511, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019978385745787018, + "lm_loss": 0.01708984375, + "loss": 0.0115, + "step": 512, + "total_loss": 0.01708984375 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001997830126323171, + "lm_loss": 0.007568359375, + "loss": 0.0084, + "step": 513, + "total_loss": 0.007568359375 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019978216616071243, + "lm_loss": 0.00860595703125, + "loss": 0.0108, + "step": 514, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001997813180430701, + "lm_loss": 0.009765625, + "loss": 0.0082, + "step": 515, + "total_loss": 0.009765625 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001997804682794041, + "lm_loss": 0.0242919921875, + "loss": 0.0119, + "step": 516, + "total_loss": 0.0242919921875 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019977961686972842, + "lm_loss": 0.01025390625, + "loss": 0.0091, + "step": 517, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019977876381405713, + "lm_loss": 0.011962890625, + "loss": 0.0083, + "step": 518, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019977790911240436, + "lm_loss": 0.01129150390625, + "loss": 0.0109, + "step": 519, + "total_loss": 0.01129150390625 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001997770527647841, + "lm_loss": 0.0054931640625, + "loss": 0.0091, + "step": 520, + "total_loss": 0.0054931640625 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019977619477121056, + "lm_loss": 0.0111083984375, + "loss": 0.0101, + "step": 521, + "total_loss": 0.0111083984375 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019977533513169786, + "lm_loss": 0.005950927734375, + "loss": 0.0089, + "step": 522, + "total_loss": 0.005950927734375 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001997744738462602, + "lm_loss": 0.006744384765625, + "loss": 0.0088, + "step": 523, + "total_loss": 0.006744384765625 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019977361091491177, + "lm_loss": 0.01446533203125, + "loss": 0.0112, + "step": 524, + "total_loss": 0.01446533203125 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001997727463376668, + "lm_loss": 0.012451171875, + "loss": 0.0101, + "step": 525, + "total_loss": 0.012451171875 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019977188011453958, + "lm_loss": 0.0036773681640625, + "loss": 0.0086, + "step": 526, + "total_loss": 0.0036773681640625 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019977101224554438, + "lm_loss": 0.01519775390625, + "loss": 0.0119, + "step": 527, + "total_loss": 0.01519775390625 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001997701427306955, + "lm_loss": 0.0103759765625, + "loss": 0.0096, + "step": 528, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019976927157000734, + "lm_loss": 0.006988525390625, + "loss": 0.0084, + "step": 529, + "total_loss": 0.006988525390625 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001997683987634942, + "lm_loss": 0.01239013671875, + "loss": 0.0101, + "step": 530, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019976752431117052, + "lm_loss": 0.003204345703125, + "loss": 0.0127, + "step": 531, + "total_loss": 0.003204345703125 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019976664821305068, + "lm_loss": 0.01513671875, + "loss": 0.0096, + "step": 532, + "total_loss": 0.01513671875 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019976577046914922, + "lm_loss": 0.012939453125, + "loss": 0.0099, + "step": 533, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019976489107948058, + "lm_loss": 0.0040283203125, + "loss": 0.0107, + "step": 534, + "total_loss": 0.0040283203125 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001997640100440592, + "lm_loss": 0.0123291015625, + "loss": 0.011, + "step": 535, + "total_loss": 0.0123291015625 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001997631273628997, + "lm_loss": 0.01239013671875, + "loss": 0.0088, + "step": 536, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019976224303601663, + "lm_loss": 0.00799560546875, + "loss": 0.009, + "step": 537, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019976135706342455, + "lm_loss": 0.01953125, + "loss": 0.0126, + "step": 538, + "total_loss": 0.01953125 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019976046944513803, + "lm_loss": 0.01068115234375, + "loss": 0.01, + "step": 539, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019975958018117183, + "lm_loss": 0.0087890625, + "loss": 0.011, + "step": 540, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019975868927154053, + "lm_loss": 0.00830078125, + "loss": 0.0116, + "step": 541, + "total_loss": 0.00830078125 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019975779671625886, + "lm_loss": 0.00927734375, + "loss": 0.0119, + "step": 542, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019975690251534153, + "lm_loss": 0.01055908203125, + "loss": 0.0092, + "step": 543, + "total_loss": 0.01055908203125 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019975600666880333, + "lm_loss": 0.010009765625, + "loss": 0.0091, + "step": 544, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019975510917665895, + "lm_loss": 0.006195068359375, + "loss": 0.0125, + "step": 545, + "total_loss": 0.006195068359375 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001997542100389233, + "lm_loss": 0.01153564453125, + "loss": 0.0099, + "step": 546, + "total_loss": 0.01153564453125 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019975330925561113, + "lm_loss": 0.0111083984375, + "loss": 0.0118, + "step": 547, + "total_loss": 0.0111083984375 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019975240682673735, + "lm_loss": 0.0025787353515625, + "loss": 0.009, + "step": 548, + "total_loss": 0.0025787353515625 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019975150275231682, + "lm_loss": 0.006683349609375, + "loss": 0.0084, + "step": 549, + "total_loss": 0.006683349609375 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019975059703236447, + "lm_loss": 0.0101318359375, + "loss": 0.0088, + "step": 550, + "total_loss": 0.0101318359375 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019974968966689525, + "lm_loss": 0.00543212890625, + "loss": 0.0108, + "step": 551, + "total_loss": 0.00543212890625 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019974878065592407, + "lm_loss": 0.01104736328125, + "loss": 0.008, + "step": 552, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.23, + "learning_rate": 0.000199747869999466, + "lm_loss": 0.01092529296875, + "loss": 0.01, + "step": 553, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019974695769753602, + "lm_loss": 0.01611328125, + "loss": 0.0119, + "step": 554, + "total_loss": 0.01611328125 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001997460437501492, + "lm_loss": 0.005523681640625, + "loss": 0.0124, + "step": 555, + "total_loss": 0.005523681640625 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019974512815732062, + "lm_loss": 0.0037841796875, + "loss": 0.0089, + "step": 556, + "total_loss": 0.0037841796875 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019974421091906535, + "lm_loss": 0.00836181640625, + "loss": 0.01, + "step": 557, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019974329203539855, + "lm_loss": 0.012939453125, + "loss": 0.0114, + "step": 558, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019974237150633534, + "lm_loss": 0.01202392578125, + "loss": 0.014, + "step": 559, + "total_loss": 0.01202392578125 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019974144933189097, + "lm_loss": 0.007354736328125, + "loss": 0.0102, + "step": 560, + "total_loss": 0.007354736328125 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001997405255120806, + "lm_loss": 0.00750732421875, + "loss": 0.0115, + "step": 561, + "total_loss": 0.00750732421875 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001997396000469195, + "lm_loss": 0.002471923828125, + "loss": 0.0098, + "step": 562, + "total_loss": 0.002471923828125 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019973867293642293, + "lm_loss": 0.0064697265625, + "loss": 0.0084, + "step": 563, + "total_loss": 0.0064697265625 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019973774418060618, + "lm_loss": 0.00787353515625, + "loss": 0.0111, + "step": 564, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019973681377948455, + "lm_loss": 0.0107421875, + "loss": 0.0095, + "step": 565, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001997358817330734, + "lm_loss": 0.01141357421875, + "loss": 0.0108, + "step": 566, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019973494804138815, + "lm_loss": 0.0089111328125, + "loss": 0.0097, + "step": 567, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019973401270444413, + "lm_loss": 0.00787353515625, + "loss": 0.0086, + "step": 568, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019973307572225681, + "lm_loss": 0.005645751953125, + "loss": 0.0093, + "step": 569, + "total_loss": 0.005645751953125 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019973213709484168, + "lm_loss": 0.0125732421875, + "loss": 0.0121, + "step": 570, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019973119682221416, + "lm_loss": 0.01031494140625, + "loss": 0.0081, + "step": 571, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019973025490438976, + "lm_loss": 0.01019287109375, + "loss": 0.0091, + "step": 572, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019972931134138407, + "lm_loss": 0.00927734375, + "loss": 0.0104, + "step": 573, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019972836613321265, + "lm_loss": 0.00775146484375, + "loss": 0.0097, + "step": 574, + "total_loss": 0.00775146484375 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019972741927989102, + "lm_loss": 0.0142822265625, + "loss": 0.011, + "step": 575, + "total_loss": 0.0142822265625 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001997264707814349, + "lm_loss": 0.0047607421875, + "loss": 0.0096, + "step": 576, + "total_loss": 0.0047607421875 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019972552063785988, + "lm_loss": 0.004791259765625, + "loss": 0.0107, + "step": 577, + "total_loss": 0.004791259765625 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019972456884918163, + "lm_loss": 0.0118408203125, + "loss": 0.01, + "step": 578, + "total_loss": 0.0118408203125 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019972361541541588, + "lm_loss": 0.01611328125, + "loss": 0.009, + "step": 579, + "total_loss": 0.01611328125 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019972266033657833, + "lm_loss": 0.006744384765625, + "loss": 0.0084, + "step": 580, + "total_loss": 0.006744384765625 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019972170361268475, + "lm_loss": 0.01385498046875, + "loss": 0.0126, + "step": 581, + "total_loss": 0.01385498046875 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019972074524375087, + "lm_loss": 0.005645751953125, + "loss": 0.0105, + "step": 582, + "total_loss": 0.005645751953125 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971978522979262, + "lm_loss": 0.01104736328125, + "loss": 0.0096, + "step": 583, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971882357082575, + "lm_loss": 0.011474609375, + "loss": 0.0103, + "step": 584, + "total_loss": 0.011474609375 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971786026686612, + "lm_loss": 0.00616455078125, + "loss": 0.0085, + "step": 585, + "total_loss": 0.00616455078125 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971689531792967, + "lm_loss": 0.00762939453125, + "loss": 0.0108, + "step": 586, + "total_loss": 0.00762939453125 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971592872403227, + "lm_loss": 0.01263427734375, + "loss": 0.0096, + "step": 587, + "total_loss": 0.01263427734375 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971496048518987, + "lm_loss": 0.0084228515625, + "loss": 0.0105, + "step": 588, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971399060141848, + "lm_loss": 0.0081787109375, + "loss": 0.0112, + "step": 589, + "total_loss": 0.0081787109375 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971301907273405, + "lm_loss": 0.005462646484375, + "loss": 0.0125, + "step": 590, + "total_loss": 0.005462646484375 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971204589915267, + "lm_loss": 0.015869140625, + "loss": 0.0112, + "step": 591, + "total_loss": 0.015869140625 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971107108069034, + "lm_loss": 0.012939453125, + "loss": 0.0094, + "step": 592, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019971009461736315, + "lm_loss": 0.01153564453125, + "loss": 0.0097, + "step": 593, + "total_loss": 0.01153564453125 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001997091165091872, + "lm_loss": 0.0098876953125, + "loss": 0.0103, + "step": 594, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019970813675617867, + "lm_loss": 0.0078125, + "loss": 0.0092, + "step": 595, + "total_loss": 0.0078125 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019970715535835368, + "lm_loss": 0.0074462890625, + "loss": 0.0084, + "step": 596, + "total_loss": 0.0074462890625 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019970617231572843, + "lm_loss": 0.00933837890625, + "loss": 0.0087, + "step": 597, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019970518762831913, + "lm_loss": 0.004425048828125, + "loss": 0.0101, + "step": 598, + "total_loss": 0.004425048828125 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019970420129614204, + "lm_loss": 0.00811767578125, + "loss": 0.0083, + "step": 599, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019970321331921342, + "lm_loss": 0.0087890625, + "loss": 0.0099, + "step": 600, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.25, + "eval_lm_loss": 0.0111524797976017, + "eval_loss": 0.011608276516199112, + "eval_runtime": 44.0118, + "eval_samples_per_second": 22.721, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.0111524797976017, + "lm_loss": 0.00151824951171875, + "step": 600, + "total_loss": 0.00151824951171875 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001997022236975496, + "lm_loss": 0.00909423828125, + "loss": 0.0102, + "step": 601, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019970123243116686, + "lm_loss": 0.0037078857421875, + "loss": 0.0086, + "step": 602, + "total_loss": 0.0037078857421875 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019970023952008153, + "lm_loss": 0.01708984375, + "loss": 0.0113, + "step": 603, + "total_loss": 0.01708984375 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019969924496431008, + "lm_loss": 0.00628662109375, + "loss": 0.0098, + "step": 604, + "total_loss": 0.00628662109375 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019969824876386885, + "lm_loss": 0.006561279296875, + "loss": 0.0084, + "step": 605, + "total_loss": 0.006561279296875 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019969725091877427, + "lm_loss": 0.0186767578125, + "loss": 0.0117, + "step": 606, + "total_loss": 0.0186767578125 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019969625142904284, + "lm_loss": 0.0155029296875, + "loss": 0.0103, + "step": 607, + "total_loss": 0.0155029296875 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019969525029469104, + "lm_loss": 0.009765625, + "loss": 0.0117, + "step": 608, + "total_loss": 0.009765625 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019969424751573533, + "lm_loss": 0.0074462890625, + "loss": 0.0084, + "step": 609, + "total_loss": 0.0074462890625 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019969324309219232, + "lm_loss": 0.0128173828125, + "loss": 0.0099, + "step": 610, + "total_loss": 0.0128173828125 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019969223702407854, + "lm_loss": 0.019775390625, + "loss": 0.0111, + "step": 611, + "total_loss": 0.019775390625 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001996912293114106, + "lm_loss": 0.0142822265625, + "loss": 0.011, + "step": 612, + "total_loss": 0.0142822265625 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019969021995420514, + "lm_loss": 0.003387451171875, + "loss": 0.0093, + "step": 613, + "total_loss": 0.003387451171875 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001996892089524788, + "lm_loss": 0.014404296875, + "loss": 0.0081, + "step": 614, + "total_loss": 0.014404296875 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019968819630624822, + "lm_loss": 0.00390625, + "loss": 0.009, + "step": 615, + "total_loss": 0.00390625 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019968718201553016, + "lm_loss": 0.01043701171875, + "loss": 0.0105, + "step": 616, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001996861660803413, + "lm_loss": 0.01263427734375, + "loss": 0.0116, + "step": 617, + "total_loss": 0.01263427734375 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019968514850069846, + "lm_loss": 0.0125732421875, + "loss": 0.0109, + "step": 618, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019968412927661838, + "lm_loss": 0.00701904296875, + "loss": 0.0087, + "step": 619, + "total_loss": 0.00701904296875 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019968310840811788, + "lm_loss": 0.005859375, + "loss": 0.0095, + "step": 620, + "total_loss": 0.005859375 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001996820858952138, + "lm_loss": 0.01239013671875, + "loss": 0.0102, + "step": 621, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019968106173792304, + "lm_loss": 0.00927734375, + "loss": 0.0109, + "step": 622, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001996800359362625, + "lm_loss": 0.01116943359375, + "loss": 0.0098, + "step": 623, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019967900849024903, + "lm_loss": 0.01031494140625, + "loss": 0.011, + "step": 624, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001996779793998996, + "lm_loss": 0.019287109375, + "loss": 0.0112, + "step": 625, + "total_loss": 0.019287109375 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019967694866523125, + "lm_loss": 0.01434326171875, + "loss": 0.0112, + "step": 626, + "total_loss": 0.01434326171875 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001996759162862609, + "lm_loss": 0.014404296875, + "loss": 0.0096, + "step": 627, + "total_loss": 0.014404296875 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019967488226300568, + "lm_loss": 0.0238037109375, + "loss": 0.0119, + "step": 628, + "total_loss": 0.0238037109375 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019967384659548256, + "lm_loss": 0.0164794921875, + "loss": 0.0089, + "step": 629, + "total_loss": 0.0164794921875 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019967280928370867, + "lm_loss": 0.0078125, + "loss": 0.0104, + "step": 630, + "total_loss": 0.0078125 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019967177032770107, + "lm_loss": 0.01385498046875, + "loss": 0.0092, + "step": 631, + "total_loss": 0.01385498046875 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019967072972747695, + "lm_loss": 0.00732421875, + "loss": 0.0106, + "step": 632, + "total_loss": 0.00732421875 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019966968748305345, + "lm_loss": 0.0106201171875, + "loss": 0.0114, + "step": 633, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001996686435944478, + "lm_loss": 0.0032501220703125, + "loss": 0.011, + "step": 634, + "total_loss": 0.0032501220703125 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019966759806167717, + "lm_loss": 0.005615234375, + "loss": 0.0087, + "step": 635, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019966655088475883, + "lm_loss": 0.01708984375, + "loss": 0.0082, + "step": 636, + "total_loss": 0.01708984375 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001996655020637101, + "lm_loss": 0.01275634765625, + "loss": 0.0097, + "step": 637, + "total_loss": 0.01275634765625 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001996644515985482, + "lm_loss": 0.012451171875, + "loss": 0.011, + "step": 638, + "total_loss": 0.012451171875 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001996633994892905, + "lm_loss": 0.01275634765625, + "loss": 0.0098, + "step": 639, + "total_loss": 0.01275634765625 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019966234573595435, + "lm_loss": 0.003204345703125, + "loss": 0.0095, + "step": 640, + "total_loss": 0.003204345703125 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019966129033855714, + "lm_loss": 0.016357421875, + "loss": 0.0142, + "step": 641, + "total_loss": 0.016357421875 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019966023329711627, + "lm_loss": 0.01312255859375, + "loss": 0.0104, + "step": 642, + "total_loss": 0.01312255859375 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019965917461164918, + "lm_loss": 0.0118408203125, + "loss": 0.0089, + "step": 643, + "total_loss": 0.0118408203125 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019965811428217337, + "lm_loss": 0.005401611328125, + "loss": 0.0092, + "step": 644, + "total_loss": 0.005401611328125 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019965705230870625, + "lm_loss": 0.005340576171875, + "loss": 0.0093, + "step": 645, + "total_loss": 0.005340576171875 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019965598869126544, + "lm_loss": 0.00872802734375, + "loss": 0.011, + "step": 646, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001996549234298684, + "lm_loss": 0.017333984375, + "loss": 0.0093, + "step": 647, + "total_loss": 0.017333984375 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019965385652453272, + "lm_loss": 0.007232666015625, + "loss": 0.0099, + "step": 648, + "total_loss": 0.007232666015625 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019965278797527604, + "lm_loss": 0.01141357421875, + "loss": 0.0093, + "step": 649, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.27, + "learning_rate": 0.000199651717782116, + "lm_loss": 0.00946044921875, + "loss": 0.0114, + "step": 650, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019965064594507014, + "lm_loss": 0.0159912109375, + "loss": 0.0107, + "step": 651, + "total_loss": 0.0159912109375 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019964957246415628, + "lm_loss": 0.01422119140625, + "loss": 0.0119, + "step": 652, + "total_loss": 0.01422119140625 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019964849733939205, + "lm_loss": 0.01544189453125, + "loss": 0.0115, + "step": 653, + "total_loss": 0.01544189453125 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019964742057079518, + "lm_loss": 0.00506591796875, + "loss": 0.0093, + "step": 654, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001996463421583835, + "lm_loss": 0.01171875, + "loss": 0.0104, + "step": 655, + "total_loss": 0.01171875 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001996452621021747, + "lm_loss": 0.0020294189453125, + "loss": 0.008, + "step": 656, + "total_loss": 0.0020294189453125 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001996441804021867, + "lm_loss": 0.0031585693359375, + "loss": 0.0094, + "step": 657, + "total_loss": 0.0031585693359375 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019964309705843727, + "lm_loss": 0.0098876953125, + "loss": 0.0094, + "step": 658, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001996420120709443, + "lm_loss": 0.00909423828125, + "loss": 0.0093, + "step": 659, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019964092543972575, + "lm_loss": 0.007781982421875, + "loss": 0.009, + "step": 660, + "total_loss": 0.007781982421875 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019963983716479944, + "lm_loss": 0.009521484375, + "loss": 0.0088, + "step": 661, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019963874724618336, + "lm_loss": 0.006927490234375, + "loss": 0.0091, + "step": 662, + "total_loss": 0.006927490234375 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019963765568389557, + "lm_loss": 0.01123046875, + "loss": 0.01, + "step": 663, + "total_loss": 0.01123046875 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019963656247795397, + "lm_loss": 0.01055908203125, + "loss": 0.0124, + "step": 664, + "total_loss": 0.01055908203125 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019963546762837662, + "lm_loss": 0.01446533203125, + "loss": 0.012, + "step": 665, + "total_loss": 0.01446533203125 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001996343711351816, + "lm_loss": 0.0036163330078125, + "loss": 0.0107, + "step": 666, + "total_loss": 0.0036163330078125 + }, + { + "epoch": 0.27, + "learning_rate": 0.000199633272998387, + "lm_loss": 0.007232666015625, + "loss": 0.0102, + "step": 667, + "total_loss": 0.007232666015625 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019963217321801094, + "lm_loss": 0.0185546875, + "loss": 0.0104, + "step": 668, + "total_loss": 0.0185546875 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019963107179407156, + "lm_loss": 0.006683349609375, + "loss": 0.0101, + "step": 669, + "total_loss": 0.006683349609375 + }, + { + "epoch": 0.27, + "learning_rate": 0.000199629968726587, + "lm_loss": 0.010986328125, + "loss": 0.0098, + "step": 670, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019962886401557548, + "lm_loss": 0.01104736328125, + "loss": 0.0091, + "step": 671, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019962775766105522, + "lm_loss": 0.00628662109375, + "loss": 0.01, + "step": 672, + "total_loss": 0.00628662109375 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019962664966304447, + "lm_loss": 0.01007080078125, + "loss": 0.0118, + "step": 673, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001996255400215615, + "lm_loss": 0.01025390625, + "loss": 0.0119, + "step": 674, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019962442873662465, + "lm_loss": 0.0074462890625, + "loss": 0.0102, + "step": 675, + "total_loss": 0.0074462890625 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019962331580825223, + "lm_loss": 0.00653076171875, + "loss": 0.0092, + "step": 676, + "total_loss": 0.00653076171875 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019962220123646255, + "lm_loss": 0.01171875, + "loss": 0.0114, + "step": 677, + "total_loss": 0.01171875 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019962108502127407, + "lm_loss": 0.0096435546875, + "loss": 0.0088, + "step": 678, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001996199671627052, + "lm_loss": 0.01239013671875, + "loss": 0.0107, + "step": 679, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019961884766077432, + "lm_loss": 0.004058837890625, + "loss": 0.0102, + "step": 680, + "total_loss": 0.004058837890625 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019961772651549994, + "lm_loss": 0.006317138671875, + "loss": 0.0099, + "step": 681, + "total_loss": 0.006317138671875 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019961660372690054, + "lm_loss": 0.00946044921875, + "loss": 0.0107, + "step": 682, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019961547929499466, + "lm_loss": 0.005706787109375, + "loss": 0.0094, + "step": 683, + "total_loss": 0.005706787109375 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019961435321980085, + "lm_loss": 0.0047607421875, + "loss": 0.0077, + "step": 684, + "total_loss": 0.0047607421875 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019961322550133764, + "lm_loss": 0.0101318359375, + "loss": 0.0096, + "step": 685, + "total_loss": 0.0101318359375 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019961209613962374, + "lm_loss": 0.01531982421875, + "loss": 0.0085, + "step": 686, + "total_loss": 0.01531982421875 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019961096513467763, + "lm_loss": 0.0072021484375, + "loss": 0.0123, + "step": 687, + "total_loss": 0.0072021484375 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001996098324865181, + "lm_loss": 0.009033203125, + "loss": 0.0089, + "step": 688, + "total_loss": 0.009033203125 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019960869819516376, + "lm_loss": 0.01080322265625, + "loss": 0.01, + "step": 689, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019960756226063332, + "lm_loss": 0.00531005859375, + "loss": 0.0109, + "step": 690, + "total_loss": 0.00531005859375 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019960642468294556, + "lm_loss": 0.006591796875, + "loss": 0.0096, + "step": 691, + "total_loss": 0.006591796875 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019960528546211922, + "lm_loss": 0.00482177734375, + "loss": 0.0091, + "step": 692, + "total_loss": 0.00482177734375 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019960414459817312, + "lm_loss": 0.0107421875, + "loss": 0.0082, + "step": 693, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019960300209112607, + "lm_loss": 0.00360107421875, + "loss": 0.0112, + "step": 694, + "total_loss": 0.00360107421875 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019960185794099685, + "lm_loss": 0.006011962890625, + "loss": 0.0105, + "step": 695, + "total_loss": 0.006011962890625 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019960071214780443, + "lm_loss": 0.009521484375, + "loss": 0.0113, + "step": 696, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001995995647115677, + "lm_loss": 0.0108642578125, + "loss": 0.0089, + "step": 697, + "total_loss": 0.0108642578125 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001995984156323055, + "lm_loss": 0.01263427734375, + "loss": 0.0098, + "step": 698, + "total_loss": 0.01263427734375 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019959726491003687, + "lm_loss": 0.00958251953125, + "loss": 0.0081, + "step": 699, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019959611254478078, + "lm_loss": 0.006561279296875, + "loss": 0.0094, + "step": 700, + "total_loss": 0.006561279296875 + }, + { + "epoch": 0.29, + "eval_lm_loss": 0.01146012730896473, + "eval_loss": 0.011966399848461151, + "eval_runtime": 43.9553, + "eval_samples_per_second": 22.75, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.01146012730896473, + "lm_loss": 0.00103759765625, + "step": 700, + "total_loss": 0.00103759765625 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019959495853655624, + "lm_loss": 0.01141357421875, + "loss": 0.0105, + "step": 701, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019959380288538223, + "lm_loss": 0.008544921875, + "loss": 0.0111, + "step": 702, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001995926455912779, + "lm_loss": 0.01373291015625, + "loss": 0.0101, + "step": 703, + "total_loss": 0.01373291015625 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001995914866542623, + "lm_loss": 0.01141357421875, + "loss": 0.0088, + "step": 704, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019959032607435453, + "lm_loss": 0.0089111328125, + "loss": 0.0097, + "step": 705, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958916385157376, + "lm_loss": 0.007598876953125, + "loss": 0.0088, + "step": 706, + "total_loss": 0.007598876953125 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958799998593917, + "lm_loss": 0.00823974609375, + "loss": 0.0102, + "step": 707, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958683447746992, + "lm_loss": 0.0096435546875, + "loss": 0.0089, + "step": 708, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958566732618529, + "lm_loss": 0.00640869140625, + "loss": 0.0076, + "step": 709, + "total_loss": 0.00640869140625 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958449853210448, + "lm_loss": 0.0107421875, + "loss": 0.0102, + "step": 710, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958332809524683, + "lm_loss": 0.00933837890625, + "loss": 0.0085, + "step": 711, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958215601563158, + "lm_loss": 0.00823974609375, + "loss": 0.0093, + "step": 712, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019958098229327814, + "lm_loss": 0.00897216796875, + "loss": 0.0097, + "step": 713, + "total_loss": 0.00897216796875 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019957980692820578, + "lm_loss": 0.00811767578125, + "loss": 0.0095, + "step": 714, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.29, + "learning_rate": 0.000199578629920434, + "lm_loss": 0.0203857421875, + "loss": 0.0118, + "step": 715, + "total_loss": 0.0203857421875 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019957745126998212, + "lm_loss": 0.007415771484375, + "loss": 0.0099, + "step": 716, + "total_loss": 0.007415771484375 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019957627097686963, + "lm_loss": 0.007049560546875, + "loss": 0.0073, + "step": 717, + "total_loss": 0.007049560546875 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019957508904111596, + "lm_loss": 0.01708984375, + "loss": 0.0085, + "step": 718, + "total_loss": 0.01708984375 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001995739054627407, + "lm_loss": 0.01904296875, + "loss": 0.0108, + "step": 719, + "total_loss": 0.01904296875 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019957272024176328, + "lm_loss": 0.005584716796875, + "loss": 0.0115, + "step": 720, + "total_loss": 0.005584716796875 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001995715333782033, + "lm_loss": 0.0145263671875, + "loss": 0.0139, + "step": 721, + "total_loss": 0.0145263671875 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019957034487208032, + "lm_loss": 0.0093994140625, + "loss": 0.0103, + "step": 722, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019956915472341393, + "lm_loss": 0.01556396484375, + "loss": 0.0111, + "step": 723, + "total_loss": 0.01556396484375 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995679629322238, + "lm_loss": 0.017333984375, + "loss": 0.0113, + "step": 724, + "total_loss": 0.017333984375 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019956676949852957, + "lm_loss": 0.0169677734375, + "loss": 0.0103, + "step": 725, + "total_loss": 0.0169677734375 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019956557442235093, + "lm_loss": 0.01324462890625, + "loss": 0.0104, + "step": 726, + "total_loss": 0.01324462890625 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995643777037076, + "lm_loss": 0.0113525390625, + "loss": 0.0112, + "step": 727, + "total_loss": 0.0113525390625 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019956317934261933, + "lm_loss": 0.00946044921875, + "loss": 0.0106, + "step": 728, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019956197933910583, + "lm_loss": 0.0086669921875, + "loss": 0.0111, + "step": 729, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019956077769318697, + "lm_loss": 0.0106201171875, + "loss": 0.009, + "step": 730, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019955957440488255, + "lm_loss": 0.01123046875, + "loss": 0.0102, + "step": 731, + "total_loss": 0.01123046875 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019955836947421242, + "lm_loss": 0.006805419921875, + "loss": 0.0108, + "step": 732, + "total_loss": 0.006805419921875 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019955716290119644, + "lm_loss": 0.01177978515625, + "loss": 0.0097, + "step": 733, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995559546858545, + "lm_loss": 0.01092529296875, + "loss": 0.0105, + "step": 734, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995547448282066, + "lm_loss": 0.01190185546875, + "loss": 0.0091, + "step": 735, + "total_loss": 0.01190185546875 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019955353332827262, + "lm_loss": 0.01031494140625, + "loss": 0.0105, + "step": 736, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995523201860726, + "lm_loss": 0.0130615234375, + "loss": 0.0118, + "step": 737, + "total_loss": 0.0130615234375 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995511054016265, + "lm_loss": 0.0096435546875, + "loss": 0.008, + "step": 738, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954988897495442, + "lm_loss": 0.006317138671875, + "loss": 0.0089, + "step": 739, + "total_loss": 0.006317138671875 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954867090607638, + "lm_loss": 0.0068359375, + "loss": 0.0095, + "step": 740, + "total_loss": 0.0068359375 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954745119501254, + "lm_loss": 0.0084228515625, + "loss": 0.0101, + "step": 741, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995462298417829, + "lm_loss": 0.003997802734375, + "loss": 0.0094, + "step": 742, + "total_loss": 0.003997802734375 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954500684640772, + "lm_loss": 0.01275634765625, + "loss": 0.0128, + "step": 743, + "total_loss": 0.01275634765625 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001995437822089071, + "lm_loss": 0.0106201171875, + "loss": 0.0086, + "step": 744, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954255592930133, + "lm_loss": 0.0093994140625, + "loss": 0.0082, + "step": 745, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019954132800761057, + "lm_loss": 0.006927490234375, + "loss": 0.0083, + "step": 746, + "total_loss": 0.006927490234375 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019954009844385507, + "lm_loss": 0.01068115234375, + "loss": 0.0111, + "step": 747, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019953886723805513, + "lm_loss": 0.005157470703125, + "loss": 0.0086, + "step": 748, + "total_loss": 0.005157470703125 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019953763439023109, + "lm_loss": 0.015380859375, + "loss": 0.0111, + "step": 749, + "total_loss": 0.015380859375 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019953639990040323, + "lm_loss": 0.00787353515625, + "loss": 0.0086, + "step": 750, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019953516376859198, + "lm_loss": 0.0108642578125, + "loss": 0.011, + "step": 751, + "total_loss": 0.0108642578125 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001995339259948177, + "lm_loss": 0.01513671875, + "loss": 0.0096, + "step": 752, + "total_loss": 0.01513671875 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019953268657910077, + "lm_loss": 0.004730224609375, + "loss": 0.0104, + "step": 753, + "total_loss": 0.004730224609375 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019953144552146173, + "lm_loss": 0.011962890625, + "loss": 0.0092, + "step": 754, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019953020282192094, + "lm_loss": 0.01141357421875, + "loss": 0.0086, + "step": 755, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.31, + "learning_rate": 0.000199528958480499, + "lm_loss": 0.016357421875, + "loss": 0.0091, + "step": 756, + "total_loss": 0.016357421875 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019952771249721635, + "lm_loss": 0.00604248046875, + "loss": 0.0099, + "step": 757, + "total_loss": 0.00604248046875 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019952646487209362, + "lm_loss": 0.01361083984375, + "loss": 0.0115, + "step": 758, + "total_loss": 0.01361083984375 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019952521560515137, + "lm_loss": 0.00628662109375, + "loss": 0.0101, + "step": 759, + "total_loss": 0.00628662109375 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019952396469641016, + "lm_loss": 0.01165771484375, + "loss": 0.0102, + "step": 760, + "total_loss": 0.01165771484375 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001995227121458907, + "lm_loss": 0.0130615234375, + "loss": 0.009, + "step": 761, + "total_loss": 0.0130615234375 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001995214579536136, + "lm_loss": 0.01177978515625, + "loss": 0.0087, + "step": 762, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019952020211959953, + "lm_loss": 0.0103759765625, + "loss": 0.0112, + "step": 763, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019951894464386927, + "lm_loss": 0.01031494140625, + "loss": 0.0088, + "step": 764, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019951768552644355, + "lm_loss": 0.004852294921875, + "loss": 0.0106, + "step": 765, + "total_loss": 0.004852294921875 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001995164247673431, + "lm_loss": 0.01806640625, + "loss": 0.0093, + "step": 766, + "total_loss": 0.01806640625 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019951516236658872, + "lm_loss": 0.0152587890625, + "loss": 0.0127, + "step": 767, + "total_loss": 0.0152587890625 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001995138983242013, + "lm_loss": 0.007293701171875, + "loss": 0.008, + "step": 768, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019951263264020167, + "lm_loss": 0.00457763671875, + "loss": 0.0089, + "step": 769, + "total_loss": 0.00457763671875 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019951136531461066, + "lm_loss": 0.00823974609375, + "loss": 0.0112, + "step": 770, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019951009634744922, + "lm_loss": 0.0120849609375, + "loss": 0.01, + "step": 771, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019950882573873822, + "lm_loss": 0.0107421875, + "loss": 0.0103, + "step": 772, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001995075534884987, + "lm_loss": 0.01202392578125, + "loss": 0.0108, + "step": 773, + "total_loss": 0.01202392578125 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019950627959675163, + "lm_loss": 0.01043701171875, + "loss": 0.0093, + "step": 774, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019950500406351802, + "lm_loss": 0.0166015625, + "loss": 0.0117, + "step": 775, + "total_loss": 0.0166015625 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019950372688881887, + "lm_loss": 0.01043701171875, + "loss": 0.0093, + "step": 776, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019950244807267532, + "lm_loss": 0.01019287109375, + "loss": 0.0101, + "step": 777, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001995011676151084, + "lm_loss": 0.0087890625, + "loss": 0.0083, + "step": 778, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019949988551613924, + "lm_loss": 0.01422119140625, + "loss": 0.0099, + "step": 779, + "total_loss": 0.01422119140625 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019949860177578902, + "lm_loss": 0.01141357421875, + "loss": 0.0091, + "step": 780, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019949731639407894, + "lm_loss": 0.0084228515625, + "loss": 0.0109, + "step": 781, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019949602937103015, + "lm_loss": 0.0147705078125, + "loss": 0.009, + "step": 782, + "total_loss": 0.0147705078125 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994947407066639, + "lm_loss": 0.006317138671875, + "loss": 0.0114, + "step": 783, + "total_loss": 0.006317138671875 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019949345040100146, + "lm_loss": 0.0027313232421875, + "loss": 0.0111, + "step": 784, + "total_loss": 0.0027313232421875 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994921584540641, + "lm_loss": 0.0091552734375, + "loss": 0.0087, + "step": 785, + "total_loss": 0.0091552734375 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019949086486587315, + "lm_loss": 0.00341796875, + "loss": 0.0089, + "step": 786, + "total_loss": 0.00341796875 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019948956963644992, + "lm_loss": 0.00506591796875, + "loss": 0.0081, + "step": 787, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994882727658158, + "lm_loss": 0.01171875, + "loss": 0.0108, + "step": 788, + "total_loss": 0.01171875 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019948697425399217, + "lm_loss": 0.00982666015625, + "loss": 0.0097, + "step": 789, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019948567410100045, + "lm_loss": 0.00994873046875, + "loss": 0.0089, + "step": 790, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994843723068621, + "lm_loss": 0.01177978515625, + "loss": 0.0124, + "step": 791, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001994830688715986, + "lm_loss": 0.01312255859375, + "loss": 0.01, + "step": 792, + "total_loss": 0.01312255859375 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019948176379523146, + "lm_loss": 0.012451171875, + "loss": 0.01, + "step": 793, + "total_loss": 0.012451171875 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019948045707778218, + "lm_loss": 0.0115966796875, + "loss": 0.0098, + "step": 794, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019947914871927232, + "lm_loss": 0.0191650390625, + "loss": 0.0091, + "step": 795, + "total_loss": 0.0191650390625 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019947783871972346, + "lm_loss": 0.0062255859375, + "loss": 0.0093, + "step": 796, + "total_loss": 0.0062255859375 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019947652707915723, + "lm_loss": 0.01220703125, + "loss": 0.0117, + "step": 797, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019947521379759525, + "lm_loss": 0.005279541015625, + "loss": 0.0092, + "step": 798, + "total_loss": 0.005279541015625 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019947389887505922, + "lm_loss": 0.0091552734375, + "loss": 0.0091, + "step": 799, + "total_loss": 0.0091552734375 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019947258231157078, + "lm_loss": 0.01220703125, + "loss": 0.0094, + "step": 800, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.33, + "eval_lm_loss": 0.010947369039058685, + "eval_loss": 0.011480826884508133, + "eval_runtime": 43.9248, + "eval_samples_per_second": 22.766, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.010947369039058685, + "lm_loss": 0.00130462646484375, + "step": 800, + "total_loss": 0.00130462646484375 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019947126410715168, + "lm_loss": 0.006591796875, + "loss": 0.0111, + "step": 801, + "total_loss": 0.006591796875 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019946994426182365, + "lm_loss": 0.0091552734375, + "loss": 0.0129, + "step": 802, + "total_loss": 0.0091552734375 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001994686227756085, + "lm_loss": 0.01239013671875, + "loss": 0.0091, + "step": 803, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019946729964852796, + "lm_loss": 0.00860595703125, + "loss": 0.0102, + "step": 804, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001994659748806039, + "lm_loss": 0.0184326171875, + "loss": 0.0099, + "step": 805, + "total_loss": 0.0184326171875 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001994646484718582, + "lm_loss": 0.00848388671875, + "loss": 0.0116, + "step": 806, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001994633204223127, + "lm_loss": 0.011962890625, + "loss": 0.0086, + "step": 807, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001994619907319893, + "lm_loss": 0.00592041015625, + "loss": 0.0091, + "step": 808, + "total_loss": 0.00592041015625 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019946065940090998, + "lm_loss": 0.007080078125, + "loss": 0.0072, + "step": 809, + "total_loss": 0.007080078125 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019945932642909667, + "lm_loss": 0.0185546875, + "loss": 0.0103, + "step": 810, + "total_loss": 0.0185546875 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019945799181657133, + "lm_loss": 0.01275634765625, + "loss": 0.0113, + "step": 811, + "total_loss": 0.01275634765625 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019945665556335605, + "lm_loss": 0.003936767578125, + "loss": 0.0097, + "step": 812, + "total_loss": 0.003936767578125 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001994553176694728, + "lm_loss": 0.00738525390625, + "loss": 0.0082, + "step": 813, + "total_loss": 0.00738525390625 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019945397813494374, + "lm_loss": 0.01171875, + "loss": 0.01, + "step": 814, + "total_loss": 0.01171875 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019945263695979087, + "lm_loss": 0.00823974609375, + "loss": 0.0098, + "step": 815, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019945129414403637, + "lm_loss": 0.01116943359375, + "loss": 0.0091, + "step": 816, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019944994968770237, + "lm_loss": 0.006591796875, + "loss": 0.0073, + "step": 817, + "total_loss": 0.006591796875 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019944860359081106, + "lm_loss": 0.0137939453125, + "loss": 0.0082, + "step": 818, + "total_loss": 0.0137939453125 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019944725585338465, + "lm_loss": 0.0101318359375, + "loss": 0.0103, + "step": 819, + "total_loss": 0.0101318359375 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019944590647544535, + "lm_loss": 0.006439208984375, + "loss": 0.0089, + "step": 820, + "total_loss": 0.006439208984375 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019944455545701546, + "lm_loss": 0.005279541015625, + "loss": 0.0081, + "step": 821, + "total_loss": 0.005279541015625 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019944320279811724, + "lm_loss": 0.0096435546875, + "loss": 0.0094, + "step": 822, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019944184849877297, + "lm_loss": 0.00836181640625, + "loss": 0.0095, + "step": 823, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019944049255900506, + "lm_loss": 0.00537109375, + "loss": 0.0092, + "step": 824, + "total_loss": 0.00537109375 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019943913497883582, + "lm_loss": 0.00787353515625, + "loss": 0.0094, + "step": 825, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001994377757582877, + "lm_loss": 0.00811767578125, + "loss": 0.008, + "step": 826, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019943641489738307, + "lm_loss": 0.0062255859375, + "loss": 0.0103, + "step": 827, + "total_loss": 0.0062255859375 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001994350523961444, + "lm_loss": 0.00714111328125, + "loss": 0.0097, + "step": 828, + "total_loss": 0.00714111328125 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019943368825459419, + "lm_loss": 0.003753662109375, + "loss": 0.0083, + "step": 829, + "total_loss": 0.003753662109375 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001994323224727549, + "lm_loss": 0.010009765625, + "loss": 0.0098, + "step": 830, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019943095505064909, + "lm_loss": 0.0120849609375, + "loss": 0.0092, + "step": 831, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001994295859882993, + "lm_loss": 0.00970458984375, + "loss": 0.0095, + "step": 832, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019942821528572813, + "lm_loss": 0.005218505859375, + "loss": 0.0085, + "step": 833, + "total_loss": 0.005218505859375 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019942684294295816, + "lm_loss": 0.00860595703125, + "loss": 0.0074, + "step": 834, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019942546896001208, + "lm_loss": 0.01104736328125, + "loss": 0.012, + "step": 835, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019942409333691252, + "lm_loss": 0.005950927734375, + "loss": 0.0089, + "step": 836, + "total_loss": 0.005950927734375 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001994227160736822, + "lm_loss": 0.0078125, + "loss": 0.0081, + "step": 837, + "total_loss": 0.0078125 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019942133717034378, + "lm_loss": 0.01031494140625, + "loss": 0.0115, + "step": 838, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001994199566269201, + "lm_loss": 0.00958251953125, + "loss": 0.0089, + "step": 839, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019941857444343383, + "lm_loss": 0.0045166015625, + "loss": 0.0087, + "step": 840, + "total_loss": 0.0045166015625 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019941719061990787, + "lm_loss": 0.00701904296875, + "loss": 0.012, + "step": 841, + "total_loss": 0.00701904296875 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019941580515636496, + "lm_loss": 0.0142822265625, + "loss": 0.0108, + "step": 842, + "total_loss": 0.0142822265625 + }, + { + "epoch": 0.34, + "learning_rate": 0.000199414418052828, + "lm_loss": 0.01007080078125, + "loss": 0.0092, + "step": 843, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001994130293093199, + "lm_loss": 0.006561279296875, + "loss": 0.0101, + "step": 844, + "total_loss": 0.006561279296875 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019941163892586353, + "lm_loss": 0.01177978515625, + "loss": 0.01, + "step": 845, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001994102469024818, + "lm_loss": 0.01312255859375, + "loss": 0.0101, + "step": 846, + "total_loss": 0.01312255859375 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019940885323919773, + "lm_loss": 0.0126953125, + "loss": 0.0099, + "step": 847, + "total_loss": 0.0126953125 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019940745793603428, + "lm_loss": 0.016357421875, + "loss": 0.0104, + "step": 848, + "total_loss": 0.016357421875 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001994060609930145, + "lm_loss": 0.0072021484375, + "loss": 0.0098, + "step": 849, + "total_loss": 0.0072021484375 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019940466241016137, + "lm_loss": 0.007659912109375, + "loss": 0.0087, + "step": 850, + "total_loss": 0.007659912109375 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019940326218749802, + "lm_loss": 0.01214599609375, + "loss": 0.0095, + "step": 851, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019940186032504751, + "lm_loss": 0.00750732421875, + "loss": 0.0119, + "step": 852, + "total_loss": 0.00750732421875 + }, + { + "epoch": 0.35, + "learning_rate": 0.000199400456822833, + "lm_loss": 0.01092529296875, + "loss": 0.0098, + "step": 853, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019939905168087763, + "lm_loss": 0.0108642578125, + "loss": 0.0087, + "step": 854, + "total_loss": 0.0108642578125 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019939764489920457, + "lm_loss": 0.00787353515625, + "loss": 0.0082, + "step": 855, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019939623647783705, + "lm_loss": 0.01129150390625, + "loss": 0.0105, + "step": 856, + "total_loss": 0.01129150390625 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019939482641679826, + "lm_loss": 0.0045166015625, + "loss": 0.0098, + "step": 857, + "total_loss": 0.0045166015625 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001993934147161115, + "lm_loss": 0.01336669921875, + "loss": 0.0097, + "step": 858, + "total_loss": 0.01336669921875 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019939200137580003, + "lm_loss": 0.006500244140625, + "loss": 0.0092, + "step": 859, + "total_loss": 0.006500244140625 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001993905863958872, + "lm_loss": 0.0084228515625, + "loss": 0.0086, + "step": 860, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019938916977639632, + "lm_loss": 0.0040283203125, + "loss": 0.0113, + "step": 861, + "total_loss": 0.0040283203125 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019938775151735077, + "lm_loss": 0.013916015625, + "loss": 0.0085, + "step": 862, + "total_loss": 0.013916015625 + }, + { + "epoch": 0.35, + "learning_rate": 0.000199386331618774, + "lm_loss": 0.00958251953125, + "loss": 0.0102, + "step": 863, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019938491008068932, + "lm_loss": 0.006378173828125, + "loss": 0.0096, + "step": 864, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019938348690312022, + "lm_loss": 0.01373291015625, + "loss": 0.0089, + "step": 865, + "total_loss": 0.01373291015625 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019938206208609024, + "lm_loss": 0.01019287109375, + "loss": 0.0108, + "step": 866, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019938063562962283, + "lm_loss": 0.00885009765625, + "loss": 0.009, + "step": 867, + "total_loss": 0.00885009765625 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019937920753374156, + "lm_loss": 0.010498046875, + "loss": 0.0107, + "step": 868, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001993777777984699, + "lm_loss": 0.0167236328125, + "loss": 0.0096, + "step": 869, + "total_loss": 0.0167236328125 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019937634642383157, + "lm_loss": 0.0150146484375, + "loss": 0.0099, + "step": 870, + "total_loss": 0.0150146484375 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019937491340985005, + "lm_loss": 0.01324462890625, + "loss": 0.0094, + "step": 871, + "total_loss": 0.01324462890625 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019937347875654908, + "lm_loss": 0.01068115234375, + "loss": 0.0092, + "step": 872, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019937204246395226, + "lm_loss": 0.0146484375, + "loss": 0.0104, + "step": 873, + "total_loss": 0.0146484375 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019937060453208334, + "lm_loss": 0.00518798828125, + "loss": 0.0104, + "step": 874, + "total_loss": 0.00518798828125 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019936916496096597, + "lm_loss": 0.01043701171875, + "loss": 0.0083, + "step": 875, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019936772375062396, + "lm_loss": 0.0120849609375, + "loss": 0.0101, + "step": 876, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019936628090108104, + "lm_loss": 0.0103759765625, + "loss": 0.0091, + "step": 877, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019936483641236105, + "lm_loss": 0.0140380859375, + "loss": 0.0095, + "step": 878, + "total_loss": 0.0140380859375 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019936339028448781, + "lm_loss": 0.0113525390625, + "loss": 0.0089, + "step": 879, + "total_loss": 0.0113525390625 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019936194251748518, + "lm_loss": 0.0167236328125, + "loss": 0.0091, + "step": 880, + "total_loss": 0.0167236328125 + }, + { + "epoch": 0.36, + "learning_rate": 0.000199360493111377, + "lm_loss": 0.007415771484375, + "loss": 0.0103, + "step": 881, + "total_loss": 0.007415771484375 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001993590420661872, + "lm_loss": 0.00836181640625, + "loss": 0.0106, + "step": 882, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019935758938193976, + "lm_loss": 0.005950927734375, + "loss": 0.0078, + "step": 883, + "total_loss": 0.005950927734375 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019935613505865857, + "lm_loss": 0.0089111328125, + "loss": 0.0075, + "step": 884, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001993546790963677, + "lm_loss": 0.00872802734375, + "loss": 0.0123, + "step": 885, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019935322149509114, + "lm_loss": 0.0054931640625, + "loss": 0.0091, + "step": 886, + "total_loss": 0.0054931640625 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019935176225485292, + "lm_loss": 0.00970458984375, + "loss": 0.0083, + "step": 887, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019935030137567707, + "lm_loss": 0.0123291015625, + "loss": 0.0093, + "step": 888, + "total_loss": 0.0123291015625 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019934883885758777, + "lm_loss": 0.005767822265625, + "loss": 0.0097, + "step": 889, + "total_loss": 0.005767822265625 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001993473747006091, + "lm_loss": 0.01409912109375, + "loss": 0.0098, + "step": 890, + "total_loss": 0.01409912109375 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019934590890476525, + "lm_loss": 0.013671875, + "loss": 0.0075, + "step": 891, + "total_loss": 0.013671875 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019934444147008036, + "lm_loss": 0.00579833984375, + "loss": 0.0087, + "step": 892, + "total_loss": 0.00579833984375 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019934297239657865, + "lm_loss": 0.015380859375, + "loss": 0.0092, + "step": 893, + "total_loss": 0.015380859375 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019934150168428438, + "lm_loss": 0.00714111328125, + "loss": 0.0098, + "step": 894, + "total_loss": 0.00714111328125 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019934002933322175, + "lm_loss": 0.01348876953125, + "loss": 0.0084, + "step": 895, + "total_loss": 0.01348876953125 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019933855534341513, + "lm_loss": 0.00750732421875, + "loss": 0.009, + "step": 896, + "total_loss": 0.00750732421875 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019933707971488875, + "lm_loss": 0.00958251953125, + "loss": 0.0102, + "step": 897, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019933560244766703, + "lm_loss": 0.0166015625, + "loss": 0.0115, + "step": 898, + "total_loss": 0.0166015625 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019933412354177426, + "lm_loss": 0.00592041015625, + "loss": 0.0086, + "step": 899, + "total_loss": 0.00592041015625 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019933264299723495, + "lm_loss": 0.01202392578125, + "loss": 0.0101, + "step": 900, + "total_loss": 0.01202392578125 + }, + { + "epoch": 0.37, + "eval_lm_loss": 0.010967607609927654, + "eval_loss": 0.011497192084789276, + "eval_runtime": 43.9954, + "eval_samples_per_second": 22.73, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.010967607609927654, + "lm_loss": 0.000972747802734375, + "step": 900, + "total_loss": 0.000972747802734375 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019933116081407342, + "lm_loss": 0.01031494140625, + "loss": 0.0111, + "step": 901, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019932967699231414, + "lm_loss": 0.0101318359375, + "loss": 0.0092, + "step": 902, + "total_loss": 0.0101318359375 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001993281915319816, + "lm_loss": 0.00689697265625, + "loss": 0.0111, + "step": 903, + "total_loss": 0.00689697265625 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019932670443310033, + "lm_loss": 0.0115966796875, + "loss": 0.0087, + "step": 904, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019932521569569485, + "lm_loss": 0.006500244140625, + "loss": 0.0109, + "step": 905, + "total_loss": 0.006500244140625 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019932372531978966, + "lm_loss": 0.00872802734375, + "loss": 0.0102, + "step": 906, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019932223330540946, + "lm_loss": 0.0089111328125, + "loss": 0.0095, + "step": 907, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019932073965257872, + "lm_loss": 0.012939453125, + "loss": 0.0102, + "step": 908, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001993192443613222, + "lm_loss": 0.0111083984375, + "loss": 0.0092, + "step": 909, + "total_loss": 0.0111083984375 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019931774743166454, + "lm_loss": 0.0029754638671875, + "loss": 0.012, + "step": 910, + "total_loss": 0.0029754638671875 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001993162488636304, + "lm_loss": 0.007049560546875, + "loss": 0.0091, + "step": 911, + "total_loss": 0.007049560546875 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019931474865724448, + "lm_loss": 0.0130615234375, + "loss": 0.0098, + "step": 912, + "total_loss": 0.0130615234375 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001993132468125316, + "lm_loss": 0.01043701171875, + "loss": 0.01, + "step": 913, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019931174332951649, + "lm_loss": 0.0089111328125, + "loss": 0.0099, + "step": 914, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019931023820822397, + "lm_loss": 0.0068359375, + "loss": 0.0083, + "step": 915, + "total_loss": 0.0068359375 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019930873144867887, + "lm_loss": 0.00738525390625, + "loss": 0.0093, + "step": 916, + "total_loss": 0.00738525390625 + }, + { + "epoch": 0.37, + "learning_rate": 0.000199307223050906, + "lm_loss": 0.012451171875, + "loss": 0.0104, + "step": 917, + "total_loss": 0.012451171875 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019930571301493032, + "lm_loss": 0.0126953125, + "loss": 0.0102, + "step": 918, + "total_loss": 0.0126953125 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001993042013407767, + "lm_loss": 0.00872802734375, + "loss": 0.0094, + "step": 919, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019930268802847005, + "lm_loss": 0.020263671875, + "loss": 0.0103, + "step": 920, + "total_loss": 0.020263671875 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019930117307803536, + "lm_loss": 0.005126953125, + "loss": 0.0095, + "step": 921, + "total_loss": 0.005126953125 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019929965648949764, + "lm_loss": 0.0103759765625, + "loss": 0.0087, + "step": 922, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019929813826288187, + "lm_loss": 0.0135498046875, + "loss": 0.008, + "step": 923, + "total_loss": 0.0135498046875 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019929661839821315, + "lm_loss": 0.00433349609375, + "loss": 0.0088, + "step": 924, + "total_loss": 0.00433349609375 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001992950968955165, + "lm_loss": 0.015625, + "loss": 0.0106, + "step": 925, + "total_loss": 0.015625 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019929357375481703, + "lm_loss": 0.01361083984375, + "loss": 0.0097, + "step": 926, + "total_loss": 0.01361083984375 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019929204897613988, + "lm_loss": 0.005401611328125, + "loss": 0.0086, + "step": 927, + "total_loss": 0.005401611328125 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001992905225595102, + "lm_loss": 0.0031890869140625, + "loss": 0.0097, + "step": 928, + "total_loss": 0.0031890869140625 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019928899450495315, + "lm_loss": 0.009521484375, + "loss": 0.01, + "step": 929, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019928746481249397, + "lm_loss": 0.0130615234375, + "loss": 0.0092, + "step": 930, + "total_loss": 0.0130615234375 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019928593348215786, + "lm_loss": 0.00579833984375, + "loss": 0.0105, + "step": 931, + "total_loss": 0.00579833984375 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001992844005139701, + "lm_loss": 0.00787353515625, + "loss": 0.0076, + "step": 932, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019928286590795602, + "lm_loss": 0.01409912109375, + "loss": 0.01, + "step": 933, + "total_loss": 0.01409912109375 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019928132966414084, + "lm_loss": 0.00616455078125, + "loss": 0.0088, + "step": 934, + "total_loss": 0.00616455078125 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019927979178255, + "lm_loss": 0.005584716796875, + "loss": 0.0102, + "step": 935, + "total_loss": 0.005584716796875 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019927825226320877, + "lm_loss": 0.01202392578125, + "loss": 0.0077, + "step": 936, + "total_loss": 0.01202392578125 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019927671110614266, + "lm_loss": 0.0068359375, + "loss": 0.008, + "step": 937, + "total_loss": 0.0068359375 + }, + { + "epoch": 0.38, + "learning_rate": 0.000199275168311377, + "lm_loss": 0.01129150390625, + "loss": 0.0079, + "step": 938, + "total_loss": 0.01129150390625 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019927362387893725, + "lm_loss": 0.01361083984375, + "loss": 0.0108, + "step": 939, + "total_loss": 0.01361083984375 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019927207780884895, + "lm_loss": 0.00775146484375, + "loss": 0.0086, + "step": 940, + "total_loss": 0.00775146484375 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019927053010113758, + "lm_loss": 0.00958251953125, + "loss": 0.0089, + "step": 941, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019926898075582863, + "lm_loss": 0.00299072265625, + "loss": 0.0086, + "step": 942, + "total_loss": 0.00299072265625 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001992674297729477, + "lm_loss": 0.00445556640625, + "loss": 0.0095, + "step": 943, + "total_loss": 0.00445556640625 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019926587715252033, + "lm_loss": 0.01025390625, + "loss": 0.01, + "step": 944, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001992643228945722, + "lm_loss": 0.00848388671875, + "loss": 0.01, + "step": 945, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001992627669991289, + "lm_loss": 0.00628662109375, + "loss": 0.0099, + "step": 946, + "total_loss": 0.00628662109375 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019926120946621613, + "lm_loss": 0.021240234375, + "loss": 0.0098, + "step": 947, + "total_loss": 0.021240234375 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019925965029585956, + "lm_loss": 0.01318359375, + "loss": 0.008, + "step": 948, + "total_loss": 0.01318359375 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001992580894880849, + "lm_loss": 0.009033203125, + "loss": 0.0088, + "step": 949, + "total_loss": 0.009033203125 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001992565270429179, + "lm_loss": 0.0264892578125, + "loss": 0.0101, + "step": 950, + "total_loss": 0.0264892578125 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019925496296038437, + "lm_loss": 0.011962890625, + "loss": 0.0089, + "step": 951, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019925339724051008, + "lm_loss": 0.01336669921875, + "loss": 0.0095, + "step": 952, + "total_loss": 0.01336669921875 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001992518298833209, + "lm_loss": 0.00933837890625, + "loss": 0.009, + "step": 953, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001992502608888426, + "lm_loss": 0.006866455078125, + "loss": 0.0095, + "step": 954, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019924869025710114, + "lm_loss": 0.0086669921875, + "loss": 0.0083, + "step": 955, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019924711798812242, + "lm_loss": 0.004119873046875, + "loss": 0.0098, + "step": 956, + "total_loss": 0.004119873046875 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019924554408193233, + "lm_loss": 0.018798828125, + "loss": 0.0088, + "step": 957, + "total_loss": 0.018798828125 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019924396853855688, + "lm_loss": 0.01116943359375, + "loss": 0.0083, + "step": 958, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019924239135802205, + "lm_loss": 0.0069580078125, + "loss": 0.0089, + "step": 959, + "total_loss": 0.0069580078125 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019924081254035384, + "lm_loss": 0.01177978515625, + "loss": 0.0118, + "step": 960, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019923923208557833, + "lm_loss": 0.00958251953125, + "loss": 0.0087, + "step": 961, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019923764999372153, + "lm_loss": 0.014404296875, + "loss": 0.0108, + "step": 962, + "total_loss": 0.014404296875 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019923606626480963, + "lm_loss": 0.0034027099609375, + "loss": 0.0092, + "step": 963, + "total_loss": 0.0034027099609375 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001992344808988687, + "lm_loss": 0.00531005859375, + "loss": 0.0085, + "step": 964, + "total_loss": 0.00531005859375 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019923289389592487, + "lm_loss": 0.0028839111328125, + "loss": 0.0088, + "step": 965, + "total_loss": 0.0028839111328125 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019923130525600436, + "lm_loss": 0.0145263671875, + "loss": 0.0107, + "step": 966, + "total_loss": 0.0145263671875 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019922971497913334, + "lm_loss": 0.0084228515625, + "loss": 0.0094, + "step": 967, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019922812306533808, + "lm_loss": 0.006103515625, + "loss": 0.0088, + "step": 968, + "total_loss": 0.006103515625 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019922652951464485, + "lm_loss": 0.0029296875, + "loss": 0.0084, + "step": 969, + "total_loss": 0.0029296875 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019922493432707989, + "lm_loss": 0.019287109375, + "loss": 0.0091, + "step": 970, + "total_loss": 0.019287109375 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019922333750266958, + "lm_loss": 0.007781982421875, + "loss": 0.0091, + "step": 971, + "total_loss": 0.007781982421875 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019922173904144018, + "lm_loss": 0.0167236328125, + "loss": 0.0106, + "step": 972, + "total_loss": 0.0167236328125 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001992201389434181, + "lm_loss": 0.0120849609375, + "loss": 0.0093, + "step": 973, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019921853720862974, + "lm_loss": 0.004241943359375, + "loss": 0.0068, + "step": 974, + "total_loss": 0.004241943359375 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019921693383710153, + "lm_loss": 0.0093994140625, + "loss": 0.0086, + "step": 975, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001992153288288599, + "lm_loss": 0.01123046875, + "loss": 0.0089, + "step": 976, + "total_loss": 0.01123046875 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019921372218393134, + "lm_loss": 0.00628662109375, + "loss": 0.009, + "step": 977, + "total_loss": 0.00628662109375 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019921211390234235, + "lm_loss": 0.009033203125, + "loss": 0.0099, + "step": 978, + "total_loss": 0.009033203125 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019921050398411948, + "lm_loss": 0.0133056640625, + "loss": 0.0106, + "step": 979, + "total_loss": 0.0133056640625 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019920889242928925, + "lm_loss": 0.00616455078125, + "loss": 0.0092, + "step": 980, + "total_loss": 0.00616455078125 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019920727923787825, + "lm_loss": 0.012939453125, + "loss": 0.007, + "step": 981, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019920566440991312, + "lm_loss": 0.01025390625, + "loss": 0.0084, + "step": 982, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019920404794542047, + "lm_loss": 0.019287109375, + "loss": 0.0102, + "step": 983, + "total_loss": 0.019287109375 + }, + { + "epoch": 0.4, + "learning_rate": 0.000199202429844427, + "lm_loss": 0.0166015625, + "loss": 0.0104, + "step": 984, + "total_loss": 0.0166015625 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019920081010695937, + "lm_loss": 0.005340576171875, + "loss": 0.008, + "step": 985, + "total_loss": 0.005340576171875 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001991991887330443, + "lm_loss": 0.00982666015625, + "loss": 0.0078, + "step": 986, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019919756572270856, + "lm_loss": 0.010009765625, + "loss": 0.0111, + "step": 987, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001991959410759789, + "lm_loss": 0.006134033203125, + "loss": 0.0089, + "step": 988, + "total_loss": 0.006134033203125 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019919431479288214, + "lm_loss": 0.00927734375, + "loss": 0.0095, + "step": 989, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001991926868734451, + "lm_loss": 0.01446533203125, + "loss": 0.0099, + "step": 990, + "total_loss": 0.01446533203125 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019919105731769463, + "lm_loss": 0.00787353515625, + "loss": 0.0088, + "step": 991, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019918942612565763, + "lm_loss": 0.00872802734375, + "loss": 0.0084, + "step": 992, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019918779329736096, + "lm_loss": 0.00677490234375, + "loss": 0.0085, + "step": 993, + "total_loss": 0.00677490234375 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019918615883283164, + "lm_loss": 0.00946044921875, + "loss": 0.0094, + "step": 994, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019918452273209655, + "lm_loss": 0.00714111328125, + "loss": 0.008, + "step": 995, + "total_loss": 0.00714111328125 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019918288499518272, + "lm_loss": 0.00885009765625, + "loss": 0.0086, + "step": 996, + "total_loss": 0.00885009765625 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019918124562211714, + "lm_loss": 0.00860595703125, + "loss": 0.0099, + "step": 997, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019917960461292693, + "lm_loss": 0.0115966796875, + "loss": 0.0086, + "step": 998, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019917796196763904, + "lm_loss": 0.01025390625, + "loss": 0.0105, + "step": 999, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019917631768628068, + "lm_loss": 0.00860595703125, + "loss": 0.009, + "step": 1000, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.41, + "eval_lm_loss": 0.01070873811841011, + "eval_loss": 0.011119727976620197, + "eval_runtime": 43.981, + "eval_samples_per_second": 22.737, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.01070873811841011, + "lm_loss": 0.00102996826171875, + "step": 1000, + "total_loss": 0.00102996826171875 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001991746717688789, + "lm_loss": 0.005706787109375, + "loss": 0.01, + "step": 1001, + "total_loss": 0.005706787109375 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019917302421546088, + "lm_loss": 0.0086669921875, + "loss": 0.0103, + "step": 1002, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019917137502605377, + "lm_loss": 0.007476806640625, + "loss": 0.0076, + "step": 1003, + "total_loss": 0.007476806640625 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019916972420068482, + "lm_loss": 0.007293701171875, + "loss": 0.0085, + "step": 1004, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019916807173938126, + "lm_loss": 0.0098876953125, + "loss": 0.0077, + "step": 1005, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019916641764217033, + "lm_loss": 0.00604248046875, + "loss": 0.0098, + "step": 1006, + "total_loss": 0.00604248046875 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019916476190907928, + "lm_loss": 0.01531982421875, + "loss": 0.0091, + "step": 1007, + "total_loss": 0.01531982421875 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001991631045401355, + "lm_loss": 0.01214599609375, + "loss": 0.0094, + "step": 1008, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001991614455353663, + "lm_loss": 0.004638671875, + "loss": 0.0084, + "step": 1009, + "total_loss": 0.004638671875 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019915978489479903, + "lm_loss": 0.00927734375, + "loss": 0.009, + "step": 1010, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001991581226184611, + "lm_loss": 0.0093994140625, + "loss": 0.0088, + "step": 1011, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019915645870637993, + "lm_loss": 0.01220703125, + "loss": 0.0094, + "step": 1012, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019915479315858297, + "lm_loss": 0.0106201171875, + "loss": 0.0091, + "step": 1013, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019915312597509766, + "lm_loss": 0.018310546875, + "loss": 0.0106, + "step": 1014, + "total_loss": 0.018310546875 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019915145715595153, + "lm_loss": 0.00927734375, + "loss": 0.0092, + "step": 1015, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019914978670117214, + "lm_loss": 0.00836181640625, + "loss": 0.0094, + "step": 1016, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019914811461078702, + "lm_loss": 0.0089111328125, + "loss": 0.0078, + "step": 1017, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019914644088482375, + "lm_loss": 0.01025390625, + "loss": 0.0097, + "step": 1018, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019914476552330995, + "lm_loss": 0.0098876953125, + "loss": 0.0103, + "step": 1019, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019914308852627324, + "lm_loss": 0.006256103515625, + "loss": 0.0098, + "step": 1020, + "total_loss": 0.006256103515625 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019914140989374127, + "lm_loss": 0.0120849609375, + "loss": 0.0102, + "step": 1021, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001991397296257418, + "lm_loss": 0.018798828125, + "loss": 0.0099, + "step": 1022, + "total_loss": 0.018798828125 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019913804772230247, + "lm_loss": 0.0120849609375, + "loss": 0.0109, + "step": 1023, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019913636418345107, + "lm_loss": 0.00787353515625, + "loss": 0.0098, + "step": 1024, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019913467900921537, + "lm_loss": 0.0208740234375, + "loss": 0.0089, + "step": 1025, + "total_loss": 0.0208740234375 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019913299219962314, + "lm_loss": 0.004486083984375, + "loss": 0.0077, + "step": 1026, + "total_loss": 0.004486083984375 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019913130375470223, + "lm_loss": 0.01348876953125, + "loss": 0.0096, + "step": 1027, + "total_loss": 0.01348876953125 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001991296136744805, + "lm_loss": 0.01318359375, + "loss": 0.0068, + "step": 1028, + "total_loss": 0.01318359375 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001991279219589858, + "lm_loss": 0.00848388671875, + "loss": 0.0086, + "step": 1029, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019912622860824607, + "lm_loss": 0.01068115234375, + "loss": 0.0091, + "step": 1030, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019912453362228925, + "lm_loss": 0.0096435546875, + "loss": 0.0112, + "step": 1031, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019912283700114324, + "lm_loss": 0.0087890625, + "loss": 0.0102, + "step": 1032, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001991211387448361, + "lm_loss": 0.009765625, + "loss": 0.0087, + "step": 1033, + "total_loss": 0.009765625 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001991194388533958, + "lm_loss": 0.00494384765625, + "loss": 0.0086, + "step": 1034, + "total_loss": 0.00494384765625 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001991177373268504, + "lm_loss": 0.007568359375, + "loss": 0.0093, + "step": 1035, + "total_loss": 0.007568359375 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019911603416522796, + "lm_loss": 0.01513671875, + "loss": 0.0084, + "step": 1036, + "total_loss": 0.01513671875 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019911432936855662, + "lm_loss": 0.01251220703125, + "loss": 0.0105, + "step": 1037, + "total_loss": 0.01251220703125 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001991126229368644, + "lm_loss": 0.00946044921875, + "loss": 0.009, + "step": 1038, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019911091487017957, + "lm_loss": 0.01007080078125, + "loss": 0.0065, + "step": 1039, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019910920516853024, + "lm_loss": 0.00848388671875, + "loss": 0.0097, + "step": 1040, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001991074938319446, + "lm_loss": 0.0162353515625, + "loss": 0.0094, + "step": 1041, + "total_loss": 0.0162353515625 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019910578086045094, + "lm_loss": 0.00799560546875, + "loss": 0.0083, + "step": 1042, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019910406625407744, + "lm_loss": 0.013671875, + "loss": 0.009, + "step": 1043, + "total_loss": 0.013671875 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019910235001285246, + "lm_loss": 0.006866455078125, + "loss": 0.0078, + "step": 1044, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019910063213680426, + "lm_loss": 0.006988525390625, + "loss": 0.0072, + "step": 1045, + "total_loss": 0.006988525390625 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019909891262596122, + "lm_loss": 0.01104736328125, + "loss": 0.0099, + "step": 1046, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019909719148035167, + "lm_loss": 0.012451171875, + "loss": 0.0079, + "step": 1047, + "total_loss": 0.012451171875 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019909546870000398, + "lm_loss": 0.0196533203125, + "loss": 0.0096, + "step": 1048, + "total_loss": 0.0196533203125 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019909374428494668, + "lm_loss": 0.006866455078125, + "loss": 0.0111, + "step": 1049, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001990920182352081, + "lm_loss": 0.019287109375, + "loss": 0.011, + "step": 1050, + "total_loss": 0.019287109375 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019909029055081674, + "lm_loss": 0.006439208984375, + "loss": 0.0077, + "step": 1051, + "total_loss": 0.006439208984375 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019908856123180115, + "lm_loss": 0.00933837890625, + "loss": 0.008, + "step": 1052, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001990868302781898, + "lm_loss": 0.007171630859375, + "loss": 0.0109, + "step": 1053, + "total_loss": 0.007171630859375 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019908509769001127, + "lm_loss": 0.0027923583984375, + "loss": 0.0089, + "step": 1054, + "total_loss": 0.0027923583984375 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019908336346729412, + "lm_loss": 0.003814697265625, + "loss": 0.0101, + "step": 1055, + "total_loss": 0.003814697265625 + }, + { + "epoch": 0.43, + "learning_rate": 0.000199081627610067, + "lm_loss": 0.01171875, + "loss": 0.0118, + "step": 1056, + "total_loss": 0.01171875 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019907989011835856, + "lm_loss": 0.024658203125, + "loss": 0.0096, + "step": 1057, + "total_loss": 0.024658203125 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019907815099219736, + "lm_loss": 0.020751953125, + "loss": 0.0077, + "step": 1058, + "total_loss": 0.020751953125 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019907641023161218, + "lm_loss": 0.007049560546875, + "loss": 0.0083, + "step": 1059, + "total_loss": 0.007049560546875 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019907466783663174, + "lm_loss": 0.00811767578125, + "loss": 0.011, + "step": 1060, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019907292380728472, + "lm_loss": 0.0079345703125, + "loss": 0.0218, + "step": 1061, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019907117814359993, + "lm_loss": 0.0074462890625, + "loss": 0.0098, + "step": 1062, + "total_loss": 0.0074462890625 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019906943084560617, + "lm_loss": 0.00811767578125, + "loss": 0.0091, + "step": 1063, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019906768191333223, + "lm_loss": 0.006744384765625, + "loss": 0.0086, + "step": 1064, + "total_loss": 0.006744384765625 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019906593134680697, + "lm_loss": 0.00787353515625, + "loss": 0.0098, + "step": 1065, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019906417914605933, + "lm_loss": 0.0079345703125, + "loss": 0.0104, + "step": 1066, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019906242531111815, + "lm_loss": 0.01116943359375, + "loss": 0.0088, + "step": 1067, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019906066984201238, + "lm_loss": 0.004608154296875, + "loss": 0.0088, + "step": 1068, + "total_loss": 0.004608154296875 + }, + { + "epoch": 0.44, + "learning_rate": 0.000199058912738771, + "lm_loss": 0.01348876953125, + "loss": 0.0103, + "step": 1069, + "total_loss": 0.01348876953125 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001990571540014229, + "lm_loss": 0.00823974609375, + "loss": 0.0102, + "step": 1070, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019905539362999723, + "lm_loss": 0.00836181640625, + "loss": 0.0077, + "step": 1071, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019905363162452298, + "lm_loss": 0.0078125, + "loss": 0.0089, + "step": 1072, + "total_loss": 0.0078125 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019905186798502915, + "lm_loss": 0.0198974609375, + "loss": 0.0104, + "step": 1073, + "total_loss": 0.0198974609375 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001990501027115449, + "lm_loss": 0.007354736328125, + "loss": 0.0098, + "step": 1074, + "total_loss": 0.007354736328125 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019904833580409935, + "lm_loss": 0.006927490234375, + "loss": 0.0073, + "step": 1075, + "total_loss": 0.006927490234375 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019904656726272166, + "lm_loss": 0.009521484375, + "loss": 0.0079, + "step": 1076, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019904479708744095, + "lm_loss": 0.01318359375, + "loss": 0.0078, + "step": 1077, + "total_loss": 0.01318359375 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019904302527828645, + "lm_loss": 0.006256103515625, + "loss": 0.0109, + "step": 1078, + "total_loss": 0.006256103515625 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019904125183528739, + "lm_loss": 0.01300048828125, + "loss": 0.0092, + "step": 1079, + "total_loss": 0.01300048828125 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019903947675847304, + "lm_loss": 0.00933837890625, + "loss": 0.0085, + "step": 1080, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019903770004787264, + "lm_loss": 0.0128173828125, + "loss": 0.0113, + "step": 1081, + "total_loss": 0.0128173828125 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019903592170351553, + "lm_loss": 0.00732421875, + "loss": 0.0092, + "step": 1082, + "total_loss": 0.00732421875 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019903414172543104, + "lm_loss": 0.0159912109375, + "loss": 0.0107, + "step": 1083, + "total_loss": 0.0159912109375 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019903236011364854, + "lm_loss": 0.00555419921875, + "loss": 0.0074, + "step": 1084, + "total_loss": 0.00555419921875 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019903057686819744, + "lm_loss": 0.0086669921875, + "loss": 0.012, + "step": 1085, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001990287919891071, + "lm_loss": 0.01055908203125, + "loss": 0.0104, + "step": 1086, + "total_loss": 0.01055908203125 + }, + { + "epoch": 0.44, + "learning_rate": 0.000199027005476407, + "lm_loss": 0.00567626953125, + "loss": 0.0104, + "step": 1087, + "total_loss": 0.00567626953125 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001990252173301266, + "lm_loss": 0.0108642578125, + "loss": 0.012, + "step": 1088, + "total_loss": 0.0108642578125 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019902342755029543, + "lm_loss": 0.01336669921875, + "loss": 0.0109, + "step": 1089, + "total_loss": 0.01336669921875 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019902163613694294, + "lm_loss": 0.01483154296875, + "loss": 0.0083, + "step": 1090, + "total_loss": 0.01483154296875 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019901984309009878, + "lm_loss": 0.004302978515625, + "loss": 0.0097, + "step": 1091, + "total_loss": 0.004302978515625 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019901804840979245, + "lm_loss": 0.01214599609375, + "loss": 0.0086, + "step": 1092, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019901625209605358, + "lm_loss": 0.02587890625, + "loss": 0.0099, + "step": 1093, + "total_loss": 0.02587890625 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019901445414891185, + "lm_loss": 0.00994873046875, + "loss": 0.0086, + "step": 1094, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019901265456839684, + "lm_loss": 0.003082275390625, + "loss": 0.0091, + "step": 1095, + "total_loss": 0.003082275390625 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019901085335453827, + "lm_loss": 0.0057373046875, + "loss": 0.0108, + "step": 1096, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019900905050736589, + "lm_loss": 0.0128173828125, + "loss": 0.0079, + "step": 1097, + "total_loss": 0.0128173828125 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001990072460269094, + "lm_loss": 0.0084228515625, + "loss": 0.0079, + "step": 1098, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019900543991319854, + "lm_loss": 0.00860595703125, + "loss": 0.0095, + "step": 1099, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019900363216626316, + "lm_loss": 0.0089111328125, + "loss": 0.0103, + "step": 1100, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.45, + "eval_lm_loss": 0.01112065464258194, + "eval_loss": 0.011493873782455921, + "eval_runtime": 44.0156, + "eval_samples_per_second": 22.719, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.01112065464258194, + "lm_loss": 0.000705718994140625, + "step": 1100, + "total_loss": 0.000705718994140625 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019900182278613306, + "lm_loss": 0.00921630859375, + "loss": 0.0099, + "step": 1101, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001990000117728381, + "lm_loss": 0.0089111328125, + "loss": 0.0082, + "step": 1102, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019899819912640813, + "lm_loss": 0.006805419921875, + "loss": 0.0084, + "step": 1103, + "total_loss": 0.006805419921875 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019899638484687304, + "lm_loss": 0.0057373046875, + "loss": 0.0096, + "step": 1104, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019899456893426282, + "lm_loss": 0.01104736328125, + "loss": 0.0092, + "step": 1105, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001989927513886074, + "lm_loss": 0.0216064453125, + "loss": 0.0131, + "step": 1106, + "total_loss": 0.0216064453125 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019899093220993673, + "lm_loss": 0.0126953125, + "loss": 0.0084, + "step": 1107, + "total_loss": 0.0126953125 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019898911139828082, + "lm_loss": 0.0067138671875, + "loss": 0.0071, + "step": 1108, + "total_loss": 0.0067138671875 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019898728895366975, + "lm_loss": 0.004364013671875, + "loss": 0.0088, + "step": 1109, + "total_loss": 0.004364013671875 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019898546487613357, + "lm_loss": 0.0067138671875, + "loss": 0.0093, + "step": 1110, + "total_loss": 0.0067138671875 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019898363916570234, + "lm_loss": 0.0028228759765625, + "loss": 0.009, + "step": 1111, + "total_loss": 0.0028228759765625 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019898181182240624, + "lm_loss": 0.00982666015625, + "loss": 0.0091, + "step": 1112, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019897998284627532, + "lm_loss": 0.003326416015625, + "loss": 0.0085, + "step": 1113, + "total_loss": 0.003326416015625 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019897815223733983, + "lm_loss": 0.01239013671875, + "loss": 0.0103, + "step": 1114, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019897631999562994, + "lm_loss": 0.01275634765625, + "loss": 0.0082, + "step": 1115, + "total_loss": 0.01275634765625 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019897448612117593, + "lm_loss": 0.01446533203125, + "loss": 0.0099, + "step": 1116, + "total_loss": 0.01446533203125 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019897265061400793, + "lm_loss": 0.0078125, + "loss": 0.0073, + "step": 1117, + "total_loss": 0.0078125 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019897081347415634, + "lm_loss": 0.0133056640625, + "loss": 0.0075, + "step": 1118, + "total_loss": 0.0133056640625 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001989689747016514, + "lm_loss": 0.008056640625, + "loss": 0.0104, + "step": 1119, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019896713429652345, + "lm_loss": 0.00738525390625, + "loss": 0.0106, + "step": 1120, + "total_loss": 0.00738525390625 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019896529225880288, + "lm_loss": 0.007720947265625, + "loss": 0.0083, + "step": 1121, + "total_loss": 0.007720947265625 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019896344858852003, + "lm_loss": 0.01220703125, + "loss": 0.0099, + "step": 1122, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019896160328570538, + "lm_loss": 0.0106201171875, + "loss": 0.0083, + "step": 1123, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001989597563503893, + "lm_loss": 0.007476806640625, + "loss": 0.0121, + "step": 1124, + "total_loss": 0.007476806640625 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001989579077826023, + "lm_loss": 0.004241943359375, + "loss": 0.0098, + "step": 1125, + "total_loss": 0.004241943359375 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019895605758237485, + "lm_loss": 0.00714111328125, + "loss": 0.0088, + "step": 1126, + "total_loss": 0.00714111328125 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001989542057497375, + "lm_loss": 0.007110595703125, + "loss": 0.012, + "step": 1127, + "total_loss": 0.007110595703125 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019895235228472078, + "lm_loss": 0.00823974609375, + "loss": 0.0103, + "step": 1128, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019895049718735526, + "lm_loss": 0.00787353515625, + "loss": 0.0086, + "step": 1129, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019894864045767158, + "lm_loss": 0.00543212890625, + "loss": 0.0089, + "step": 1130, + "total_loss": 0.00543212890625 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019894678209570027, + "lm_loss": 0.00799560546875, + "loss": 0.0085, + "step": 1131, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019894492210147213, + "lm_loss": 0.01092529296875, + "loss": 0.0097, + "step": 1132, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019894306047501772, + "lm_loss": 0.0091552734375, + "loss": 0.0095, + "step": 1133, + "total_loss": 0.0091552734375 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019894119721636783, + "lm_loss": 0.0103759765625, + "loss": 0.0111, + "step": 1134, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019893933232555314, + "lm_loss": 0.00408935546875, + "loss": 0.0089, + "step": 1135, + "total_loss": 0.00408935546875 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019893746580260446, + "lm_loss": 0.0098876953125, + "loss": 0.0086, + "step": 1136, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019893559764755256, + "lm_loss": 0.004730224609375, + "loss": 0.0104, + "step": 1137, + "total_loss": 0.004730224609375 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019893372786042827, + "lm_loss": 0.0186767578125, + "loss": 0.0103, + "step": 1138, + "total_loss": 0.0186767578125 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001989318564412624, + "lm_loss": 0.003936767578125, + "loss": 0.0085, + "step": 1139, + "total_loss": 0.003936767578125 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019892998339008587, + "lm_loss": 0.00750732421875, + "loss": 0.011, + "step": 1140, + "total_loss": 0.00750732421875 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019892810870692952, + "lm_loss": 0.0098876953125, + "loss": 0.0093, + "step": 1141, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001989262323918243, + "lm_loss": 0.00799560546875, + "loss": 0.0079, + "step": 1142, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001989243544448012, + "lm_loss": 0.0089111328125, + "loss": 0.0085, + "step": 1143, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019892247486589116, + "lm_loss": 0.0086669921875, + "loss": 0.0116, + "step": 1144, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019892059365512522, + "lm_loss": 0.010009765625, + "loss": 0.0076, + "step": 1145, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019891871081253434, + "lm_loss": 0.0111083984375, + "loss": 0.0072, + "step": 1146, + "total_loss": 0.0111083984375 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019891682633814964, + "lm_loss": 0.00653076171875, + "loss": 0.0082, + "step": 1147, + "total_loss": 0.00653076171875 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019891494023200224, + "lm_loss": 0.01507568359375, + "loss": 0.0117, + "step": 1148, + "total_loss": 0.01507568359375 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019891305249412317, + "lm_loss": 0.00872802734375, + "loss": 0.0073, + "step": 1149, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019891116312454362, + "lm_loss": 0.0155029296875, + "loss": 0.0086, + "step": 1150, + "total_loss": 0.0155029296875 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019890927212329475, + "lm_loss": 0.01336669921875, + "loss": 0.01, + "step": 1151, + "total_loss": 0.01336669921875 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019890737949040775, + "lm_loss": 0.004791259765625, + "loss": 0.0068, + "step": 1152, + "total_loss": 0.004791259765625 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019890548522591385, + "lm_loss": 0.006622314453125, + "loss": 0.0087, + "step": 1153, + "total_loss": 0.006622314453125 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001989035893298443, + "lm_loss": 0.0108642578125, + "loss": 0.0074, + "step": 1154, + "total_loss": 0.0108642578125 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019890169180223033, + "lm_loss": 0.004180908203125, + "loss": 0.0074, + "step": 1155, + "total_loss": 0.004180908203125 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001988997926431033, + "lm_loss": 0.00628662109375, + "loss": 0.0084, + "step": 1156, + "total_loss": 0.00628662109375 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019889789185249451, + "lm_loss": 0.00823974609375, + "loss": 0.0108, + "step": 1157, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019889598943043535, + "lm_loss": 0.004608154296875, + "loss": 0.0098, + "step": 1158, + "total_loss": 0.004608154296875 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019889408537695716, + "lm_loss": 0.0069580078125, + "loss": 0.0075, + "step": 1159, + "total_loss": 0.0069580078125 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019889217969209139, + "lm_loss": 0.0155029296875, + "loss": 0.0093, + "step": 1160, + "total_loss": 0.0155029296875 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001988902723758694, + "lm_loss": 0.014892578125, + "loss": 0.0097, + "step": 1161, + "total_loss": 0.014892578125 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019888836342832277, + "lm_loss": 0.007049560546875, + "loss": 0.0093, + "step": 1162, + "total_loss": 0.007049560546875 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019888645284948288, + "lm_loss": 0.0177001953125, + "loss": 0.0089, + "step": 1163, + "total_loss": 0.0177001953125 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019888454063938133, + "lm_loss": 0.005096435546875, + "loss": 0.0083, + "step": 1164, + "total_loss": 0.005096435546875 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001988826267980496, + "lm_loss": 0.0031890869140625, + "loss": 0.0097, + "step": 1165, + "total_loss": 0.0031890869140625 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019888071132551934, + "lm_loss": 0.00885009765625, + "loss": 0.0086, + "step": 1166, + "total_loss": 0.00885009765625 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019887879422182206, + "lm_loss": 0.006591796875, + "loss": 0.0084, + "step": 1167, + "total_loss": 0.006591796875 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019887687548698942, + "lm_loss": 0.009521484375, + "loss": 0.0099, + "step": 1168, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001988749551210531, + "lm_loss": 0.0086669921875, + "loss": 0.0087, + "step": 1169, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019887303312404476, + "lm_loss": 0.00787353515625, + "loss": 0.009, + "step": 1170, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019887110949599608, + "lm_loss": 0.0034027099609375, + "loss": 0.0076, + "step": 1171, + "total_loss": 0.0034027099609375 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001988691842369388, + "lm_loss": 0.0067138671875, + "loss": 0.01, + "step": 1172, + "total_loss": 0.0067138671875 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019886725734690472, + "lm_loss": 0.0045166015625, + "loss": 0.0076, + "step": 1173, + "total_loss": 0.0045166015625 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001988653288259256, + "lm_loss": 0.0137939453125, + "loss": 0.0084, + "step": 1174, + "total_loss": 0.0137939453125 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019886339867403326, + "lm_loss": 0.007293701171875, + "loss": 0.0093, + "step": 1175, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019886146689125953, + "lm_loss": 0.01434326171875, + "loss": 0.0117, + "step": 1176, + "total_loss": 0.01434326171875 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001988595334776363, + "lm_loss": 0.006500244140625, + "loss": 0.009, + "step": 1177, + "total_loss": 0.006500244140625 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019885759843319542, + "lm_loss": 0.017822265625, + "loss": 0.0086, + "step": 1178, + "total_loss": 0.017822265625 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019885566175796882, + "lm_loss": 0.007293701171875, + "loss": 0.0084, + "step": 1179, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001988537234519885, + "lm_loss": 0.00445556640625, + "loss": 0.0083, + "step": 1180, + "total_loss": 0.00445556640625 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019885178351528642, + "lm_loss": 0.00787353515625, + "loss": 0.0091, + "step": 1181, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001988498419478945, + "lm_loss": 0.0164794921875, + "loss": 0.0083, + "step": 1182, + "total_loss": 0.0164794921875 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019884789874984485, + "lm_loss": 0.00921630859375, + "loss": 0.0078, + "step": 1183, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019884595392116955, + "lm_loss": 0.01300048828125, + "loss": 0.0092, + "step": 1184, + "total_loss": 0.01300048828125 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019884400746190059, + "lm_loss": 0.0054931640625, + "loss": 0.0079, + "step": 1185, + "total_loss": 0.0054931640625 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019884205937207016, + "lm_loss": 0.00836181640625, + "loss": 0.011, + "step": 1186, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019884010965171033, + "lm_loss": 0.01239013671875, + "loss": 0.0093, + "step": 1187, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019883815830085334, + "lm_loss": 0.0081787109375, + "loss": 0.0081, + "step": 1188, + "total_loss": 0.0081787109375 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019883620531953132, + "lm_loss": 0.00689697265625, + "loss": 0.0092, + "step": 1189, + "total_loss": 0.00689697265625 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001988342507077765, + "lm_loss": 0.01214599609375, + "loss": 0.008, + "step": 1190, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019883229446562113, + "lm_loss": 0.003936767578125, + "loss": 0.0081, + "step": 1191, + "total_loss": 0.003936767578125 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001988303365930975, + "lm_loss": 0.01104736328125, + "loss": 0.0088, + "step": 1192, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019882837709023785, + "lm_loss": 0.0078125, + "loss": 0.007, + "step": 1193, + "total_loss": 0.0078125 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019882641595707457, + "lm_loss": 0.01220703125, + "loss": 0.0089, + "step": 1194, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019882445319363998, + "lm_loss": 0.00909423828125, + "loss": 0.0087, + "step": 1195, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019882248879996643, + "lm_loss": 0.0093994140625, + "loss": 0.0098, + "step": 1196, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001988205227760864, + "lm_loss": 0.00830078125, + "loss": 0.0088, + "step": 1197, + "total_loss": 0.00830078125 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019881855512203224, + "lm_loss": 0.003326416015625, + "loss": 0.0078, + "step": 1198, + "total_loss": 0.003326416015625 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019881658583783648, + "lm_loss": 0.0050048828125, + "loss": 0.0111, + "step": 1199, + "total_loss": 0.0050048828125 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019881461492353157, + "lm_loss": 0.01190185546875, + "loss": 0.0103, + "step": 1200, + "total_loss": 0.01190185546875 + }, + { + "epoch": 0.49, + "eval_lm_loss": 0.010570013895630836, + "eval_loss": 0.011091976426541805, + "eval_runtime": 44.0922, + "eval_samples_per_second": 22.68, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.010570013895630836, + "lm_loss": 0.0012359619140625, + "step": 1200, + "total_loss": 0.0012359619140625 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019881264237915002, + "lm_loss": 0.007537841796875, + "loss": 0.0077, + "step": 1201, + "total_loss": 0.007537841796875 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019881066820472438, + "lm_loss": 0.007232666015625, + "loss": 0.0084, + "step": 1202, + "total_loss": 0.007232666015625 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019880869240028722, + "lm_loss": 0.005279541015625, + "loss": 0.0094, + "step": 1203, + "total_loss": 0.005279541015625 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001988067149658711, + "lm_loss": 0.0035247802734375, + "loss": 0.0089, + "step": 1204, + "total_loss": 0.0035247802734375 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019880473590150867, + "lm_loss": 0.0098876953125, + "loss": 0.0075, + "step": 1205, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001988027552072326, + "lm_loss": 0.00421142578125, + "loss": 0.0109, + "step": 1206, + "total_loss": 0.00421142578125 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019880077288307553, + "lm_loss": 0.0166015625, + "loss": 0.0087, + "step": 1207, + "total_loss": 0.0166015625 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019879878892907016, + "lm_loss": 0.0096435546875, + "loss": 0.0101, + "step": 1208, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019879680334524924, + "lm_loss": 0.01544189453125, + "loss": 0.0094, + "step": 1209, + "total_loss": 0.01544189453125 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001987948161316455, + "lm_loss": 0.00701904296875, + "loss": 0.0085, + "step": 1210, + "total_loss": 0.00701904296875 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019879282728829174, + "lm_loss": 0.0087890625, + "loss": 0.0082, + "step": 1211, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019879083681522075, + "lm_loss": 0.01116943359375, + "loss": 0.0103, + "step": 1212, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001987888447124654, + "lm_loss": 0.006195068359375, + "loss": 0.0093, + "step": 1213, + "total_loss": 0.006195068359375 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001987868509800585, + "lm_loss": 0.007568359375, + "loss": 0.0087, + "step": 1214, + "total_loss": 0.007568359375 + }, + { + "epoch": 0.5, + "learning_rate": 0.000198784855618033, + "lm_loss": 0.0164794921875, + "loss": 0.0096, + "step": 1215, + "total_loss": 0.0164794921875 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019878285862642178, + "lm_loss": 0.007354736328125, + "loss": 0.0087, + "step": 1216, + "total_loss": 0.007354736328125 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019878086000525778, + "lm_loss": 0.0167236328125, + "loss": 0.0093, + "step": 1217, + "total_loss": 0.0167236328125 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019877885975457397, + "lm_loss": 0.00811767578125, + "loss": 0.0078, + "step": 1218, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019877685787440336, + "lm_loss": 0.004974365234375, + "loss": 0.0083, + "step": 1219, + "total_loss": 0.004974365234375 + }, + { + "epoch": 0.5, + "learning_rate": 0.000198774854364779, + "lm_loss": 0.0189208984375, + "loss": 0.01, + "step": 1220, + "total_loss": 0.0189208984375 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019877284922573387, + "lm_loss": 0.005828857421875, + "loss": 0.009, + "step": 1221, + "total_loss": 0.005828857421875 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001987708424573011, + "lm_loss": 0.004241943359375, + "loss": 0.0096, + "step": 1222, + "total_loss": 0.004241943359375 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019876883405951377, + "lm_loss": 0.00714111328125, + "loss": 0.0092, + "step": 1223, + "total_loss": 0.00714111328125 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019876682403240504, + "lm_loss": 0.003173828125, + "loss": 0.0071, + "step": 1224, + "total_loss": 0.003173828125 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019876481237600805, + "lm_loss": 0.0150146484375, + "loss": 0.0096, + "step": 1225, + "total_loss": 0.0150146484375 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019876279909035598, + "lm_loss": 0.00909423828125, + "loss": 0.01, + "step": 1226, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019876078417548206, + "lm_loss": 0.01165771484375, + "loss": 0.0094, + "step": 1227, + "total_loss": 0.01165771484375 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001987587676314195, + "lm_loss": 0.00482177734375, + "loss": 0.0079, + "step": 1228, + "total_loss": 0.00482177734375 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019875674945820157, + "lm_loss": 0.00823974609375, + "loss": 0.0093, + "step": 1229, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019875472965586157, + "lm_loss": 0.01287841796875, + "loss": 0.0105, + "step": 1230, + "total_loss": 0.01287841796875 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019875270822443286, + "lm_loss": 0.015869140625, + "loss": 0.0109, + "step": 1231, + "total_loss": 0.015869140625 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019875068516394872, + "lm_loss": 0.0125732421875, + "loss": 0.0086, + "step": 1232, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019874866047444255, + "lm_loss": 0.01708984375, + "loss": 0.0084, + "step": 1233, + "total_loss": 0.01708984375 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019874663415594778, + "lm_loss": 0.01251220703125, + "loss": 0.0105, + "step": 1234, + "total_loss": 0.01251220703125 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001987446062084978, + "lm_loss": 0.004669189453125, + "loss": 0.0087, + "step": 1235, + "total_loss": 0.004669189453125 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019874257663212604, + "lm_loss": 0.00494384765625, + "loss": 0.0078, + "step": 1236, + "total_loss": 0.00494384765625 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019874054542686604, + "lm_loss": 0.00958251953125, + "loss": 0.0088, + "step": 1237, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019873851259275127, + "lm_loss": 0.0118408203125, + "loss": 0.0078, + "step": 1238, + "total_loss": 0.0118408203125 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019873647812981528, + "lm_loss": 0.00909423828125, + "loss": 0.0106, + "step": 1239, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019873444203809165, + "lm_loss": 0.00946044921875, + "loss": 0.0088, + "step": 1240, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019873240431761392, + "lm_loss": 0.01263427734375, + "loss": 0.0094, + "step": 1241, + "total_loss": 0.01263427734375 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001987303649684157, + "lm_loss": 0.00604248046875, + "loss": 0.0095, + "step": 1242, + "total_loss": 0.00604248046875 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001987283239905307, + "lm_loss": 0.007080078125, + "loss": 0.0089, + "step": 1243, + "total_loss": 0.007080078125 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019872628138399253, + "lm_loss": 0.008544921875, + "loss": 0.0079, + "step": 1244, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019872423714883492, + "lm_loss": 0.005462646484375, + "loss": 0.0088, + "step": 1245, + "total_loss": 0.005462646484375 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019872219128509158, + "lm_loss": 0.0050048828125, + "loss": 0.0089, + "step": 1246, + "total_loss": 0.0050048828125 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019872014379279624, + "lm_loss": 0.0113525390625, + "loss": 0.0086, + "step": 1247, + "total_loss": 0.0113525390625 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001987180946719827, + "lm_loss": 0.01214599609375, + "loss": 0.0079, + "step": 1248, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019871604392268476, + "lm_loss": 0.0093994140625, + "loss": 0.009, + "step": 1249, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001987139915449362, + "lm_loss": 0.004608154296875, + "loss": 0.0071, + "step": 1250, + "total_loss": 0.004608154296875 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019871193753877097, + "lm_loss": 0.01312255859375, + "loss": 0.0079, + "step": 1251, + "total_loss": 0.01312255859375 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019870988190422292, + "lm_loss": 0.003082275390625, + "loss": 0.009, + "step": 1252, + "total_loss": 0.003082275390625 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001987078246413259, + "lm_loss": 0.0184326171875, + "loss": 0.0094, + "step": 1253, + "total_loss": 0.0184326171875 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001987057657501139, + "lm_loss": 0.0103759765625, + "loss": 0.0092, + "step": 1254, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001987037052306209, + "lm_loss": 0.00994873046875, + "loss": 0.0102, + "step": 1255, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019870164308288083, + "lm_loss": 0.00592041015625, + "loss": 0.0094, + "step": 1256, + "total_loss": 0.00592041015625 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001986995793069278, + "lm_loss": 0.004852294921875, + "loss": 0.0073, + "step": 1257, + "total_loss": 0.004852294921875 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019869751390279576, + "lm_loss": 0.00921630859375, + "loss": 0.0089, + "step": 1258, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019869544687051884, + "lm_loss": 0.00787353515625, + "loss": 0.0069, + "step": 1259, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001986933782101311, + "lm_loss": 0.0072021484375, + "loss": 0.0126, + "step": 1260, + "total_loss": 0.0072021484375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001986913079216667, + "lm_loss": 0.01019287109375, + "loss": 0.0092, + "step": 1261, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019868923600515977, + "lm_loss": 0.0034637451171875, + "loss": 0.0092, + "step": 1262, + "total_loss": 0.0034637451171875 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001986871624606445, + "lm_loss": 0.02001953125, + "loss": 0.0084, + "step": 1263, + "total_loss": 0.02001953125 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019868508728815512, + "lm_loss": 0.017822265625, + "loss": 0.011, + "step": 1264, + "total_loss": 0.017822265625 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001986830104877258, + "lm_loss": 0.00848388671875, + "loss": 0.0068, + "step": 1265, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019868093205939084, + "lm_loss": 0.0023956298828125, + "loss": 0.0075, + "step": 1266, + "total_loss": 0.0023956298828125 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001986788520031845, + "lm_loss": 0.0025177001953125, + "loss": 0.0078, + "step": 1267, + "total_loss": 0.0025177001953125 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019867677031914116, + "lm_loss": 0.00872802734375, + "loss": 0.0086, + "step": 1268, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019867468700729508, + "lm_loss": 0.0036773681640625, + "loss": 0.0081, + "step": 1269, + "total_loss": 0.0036773681640625 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019867260206768067, + "lm_loss": 0.01043701171875, + "loss": 0.0081, + "step": 1270, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019867051550033233, + "lm_loss": 0.00885009765625, + "loss": 0.0102, + "step": 1271, + "total_loss": 0.00885009765625 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019866842730528446, + "lm_loss": 0.008056640625, + "loss": 0.0084, + "step": 1272, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001986663374825715, + "lm_loss": 0.00714111328125, + "loss": 0.0112, + "step": 1273, + "total_loss": 0.00714111328125 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019866424603222796, + "lm_loss": 0.00604248046875, + "loss": 0.0092, + "step": 1274, + "total_loss": 0.00604248046875 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001986621529542883, + "lm_loss": 0.00848388671875, + "loss": 0.0089, + "step": 1275, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019866005824878706, + "lm_loss": 0.01251220703125, + "loss": 0.0093, + "step": 1276, + "total_loss": 0.01251220703125 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019865796191575883, + "lm_loss": 0.0050048828125, + "loss": 0.0076, + "step": 1277, + "total_loss": 0.0050048828125 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019865586395523814, + "lm_loss": 0.00970458984375, + "loss": 0.0105, + "step": 1278, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019865376436725962, + "lm_loss": 0.013427734375, + "loss": 0.0096, + "step": 1279, + "total_loss": 0.013427734375 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019865166315185794, + "lm_loss": 0.00787353515625, + "loss": 0.0092, + "step": 1280, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019864956030906769, + "lm_loss": 0.007171630859375, + "loss": 0.0082, + "step": 1281, + "total_loss": 0.007171630859375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001986474558389236, + "lm_loss": 0.013916015625, + "loss": 0.0095, + "step": 1282, + "total_loss": 0.013916015625 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001986453497414604, + "lm_loss": 0.003997802734375, + "loss": 0.011, + "step": 1283, + "total_loss": 0.003997802734375 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019864324201671282, + "lm_loss": 0.01239013671875, + "loss": 0.0092, + "step": 1284, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019864113266471564, + "lm_loss": 0.00982666015625, + "loss": 0.0095, + "step": 1285, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019863902168550365, + "lm_loss": 0.0142822265625, + "loss": 0.0095, + "step": 1286, + "total_loss": 0.0142822265625 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019863690907911163, + "lm_loss": 0.004364013671875, + "loss": 0.0076, + "step": 1287, + "total_loss": 0.004364013671875 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019863479484557445, + "lm_loss": 0.01104736328125, + "loss": 0.0094, + "step": 1288, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019863267898492706, + "lm_loss": 0.016357421875, + "loss": 0.0132, + "step": 1289, + "total_loss": 0.016357421875 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001986305614972043, + "lm_loss": 0.00823974609375, + "loss": 0.0089, + "step": 1290, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019862844238244107, + "lm_loss": 0.02685546875, + "loss": 0.0097, + "step": 1291, + "total_loss": 0.02685546875 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001986263216406724, + "lm_loss": 0.0078125, + "loss": 0.01, + "step": 1292, + "total_loss": 0.0078125 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019862419927193324, + "lm_loss": 0.00665283203125, + "loss": 0.0077, + "step": 1293, + "total_loss": 0.00665283203125 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001986220752762586, + "lm_loss": 0.00946044921875, + "loss": 0.0118, + "step": 1294, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019861994965368355, + "lm_loss": 0.00811767578125, + "loss": 0.0081, + "step": 1295, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019861782240424307, + "lm_loss": 0.006561279296875, + "loss": 0.009, + "step": 1296, + "total_loss": 0.006561279296875 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019861569352797232, + "lm_loss": 0.01068115234375, + "loss": 0.0095, + "step": 1297, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019861356302490645, + "lm_loss": 0.0084228515625, + "loss": 0.0094, + "step": 1298, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001986114308950805, + "lm_loss": 0.01171875, + "loss": 0.0074, + "step": 1299, + "total_loss": 0.01171875 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019860929713852975, + "lm_loss": 0.008544921875, + "loss": 0.0099, + "step": 1300, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.53, + "eval_lm_loss": 0.010627499781548977, + "eval_loss": 0.011026458814740181, + "eval_runtime": 44.281, + "eval_samples_per_second": 22.583, + "eval_steps_per_second": 0.203, + "eval_total_loss": 0.010627499781548977, + "lm_loss": 0.000827789306640625, + "step": 1300, + "total_loss": 0.000827789306640625 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019860716175528932, + "lm_loss": 0.006103515625, + "loss": 0.0107, + "step": 1301, + "total_loss": 0.006103515625 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001986050247453945, + "lm_loss": 0.009521484375, + "loss": 0.009, + "step": 1302, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001986028861088805, + "lm_loss": 0.006988525390625, + "loss": 0.0081, + "step": 1303, + "total_loss": 0.006988525390625 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001986007458457826, + "lm_loss": 0.0036468505859375, + "loss": 0.0096, + "step": 1304, + "total_loss": 0.0036468505859375 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019859860395613613, + "lm_loss": 0.0048828125, + "loss": 0.0095, + "step": 1305, + "total_loss": 0.0048828125 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019859646043997642, + "lm_loss": 0.00286865234375, + "loss": 0.008, + "step": 1306, + "total_loss": 0.00286865234375 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019859431529733882, + "lm_loss": 0.01239013671875, + "loss": 0.0083, + "step": 1307, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001985921685282587, + "lm_loss": 0.007476806640625, + "loss": 0.0089, + "step": 1308, + "total_loss": 0.007476806640625 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019859002013277146, + "lm_loss": 0.01116943359375, + "loss": 0.0074, + "step": 1309, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019858787011091262, + "lm_loss": 0.0037994384765625, + "loss": 0.0083, + "step": 1310, + "total_loss": 0.0037994384765625 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001985857184627176, + "lm_loss": 0.00537109375, + "loss": 0.008, + "step": 1311, + "total_loss": 0.00537109375 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019858356518822189, + "lm_loss": 0.0048828125, + "loss": 0.0082, + "step": 1312, + "total_loss": 0.0048828125 + }, + { + "epoch": 0.54, + "learning_rate": 0.000198581410287461, + "lm_loss": 0.007080078125, + "loss": 0.0082, + "step": 1313, + "total_loss": 0.007080078125 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019857925376047048, + "lm_loss": 0.008544921875, + "loss": 0.0083, + "step": 1314, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019857709560728597, + "lm_loss": 0.01458740234375, + "loss": 0.0111, + "step": 1315, + "total_loss": 0.01458740234375 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019857493582794297, + "lm_loss": 0.00445556640625, + "loss": 0.0081, + "step": 1316, + "total_loss": 0.00445556640625 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001985727744224772, + "lm_loss": 0.0030670166015625, + "loss": 0.0087, + "step": 1317, + "total_loss": 0.0030670166015625 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019857061139092423, + "lm_loss": 0.0034332275390625, + "loss": 0.007, + "step": 1318, + "total_loss": 0.0034332275390625 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001985684467333198, + "lm_loss": 0.007720947265625, + "loss": 0.0073, + "step": 1319, + "total_loss": 0.007720947265625 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019856628044969963, + "lm_loss": 0.0087890625, + "loss": 0.009, + "step": 1320, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001985641125400994, + "lm_loss": 0.005035400390625, + "loss": 0.0092, + "step": 1321, + "total_loss": 0.005035400390625 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001985619430045549, + "lm_loss": 0.010009765625, + "loss": 0.0104, + "step": 1322, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019855977184310195, + "lm_loss": 0.0137939453125, + "loss": 0.0107, + "step": 1323, + "total_loss": 0.0137939453125 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001985575990557763, + "lm_loss": 0.0167236328125, + "loss": 0.0092, + "step": 1324, + "total_loss": 0.0167236328125 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019855542464261385, + "lm_loss": 0.0177001953125, + "loss": 0.0086, + "step": 1325, + "total_loss": 0.0177001953125 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019855324860365046, + "lm_loss": 0.0137939453125, + "loss": 0.0104, + "step": 1326, + "total_loss": 0.0137939453125 + }, + { + "epoch": 0.54, + "learning_rate": 0.000198551070938922, + "lm_loss": 0.01104736328125, + "loss": 0.0114, + "step": 1327, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019854889164846442, + "lm_loss": 0.004425048828125, + "loss": 0.0087, + "step": 1328, + "total_loss": 0.004425048828125 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019854671073231365, + "lm_loss": 0.006378173828125, + "loss": 0.0086, + "step": 1329, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001985445281905057, + "lm_loss": 0.00482177734375, + "loss": 0.0085, + "step": 1330, + "total_loss": 0.00482177734375 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019854234402307653, + "lm_loss": 0.0030059814453125, + "loss": 0.0084, + "step": 1331, + "total_loss": 0.0030059814453125 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001985401582300622, + "lm_loss": 0.0172119140625, + "loss": 0.0111, + "step": 1332, + "total_loss": 0.0172119140625 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019853797081149878, + "lm_loss": 0.01275634765625, + "loss": 0.0069, + "step": 1333, + "total_loss": 0.01275634765625 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001985357817674223, + "lm_loss": 0.0128173828125, + "loss": 0.011, + "step": 1334, + "total_loss": 0.0128173828125 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001985335910978689, + "lm_loss": 0.00604248046875, + "loss": 0.0091, + "step": 1335, + "total_loss": 0.00604248046875 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019853139880287476, + "lm_loss": 0.0145263671875, + "loss": 0.0096, + "step": 1336, + "total_loss": 0.0145263671875 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019852920488247597, + "lm_loss": 0.0106201171875, + "loss": 0.0097, + "step": 1337, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019852700933670878, + "lm_loss": 0.0068359375, + "loss": 0.0085, + "step": 1338, + "total_loss": 0.0068359375 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019852481216560942, + "lm_loss": 0.00830078125, + "loss": 0.0083, + "step": 1339, + "total_loss": 0.00830078125 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019852261336921406, + "lm_loss": 0.00927734375, + "loss": 0.0085, + "step": 1340, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.55, + "learning_rate": 0.000198520412947559, + "lm_loss": 0.0086669921875, + "loss": 0.0073, + "step": 1341, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019851821090068059, + "lm_loss": 0.00787353515625, + "loss": 0.0088, + "step": 1342, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001985160072286151, + "lm_loss": 0.00921630859375, + "loss": 0.0101, + "step": 1343, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019851380193139893, + "lm_loss": 0.01495361328125, + "loss": 0.0088, + "step": 1344, + "total_loss": 0.01495361328125 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001985115950090684, + "lm_loss": 0.007598876953125, + "loss": 0.0078, + "step": 1345, + "total_loss": 0.007598876953125 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019850938646165996, + "lm_loss": 0.00921630859375, + "loss": 0.0099, + "step": 1346, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019850717628921004, + "lm_loss": 0.005859375, + "loss": 0.0101, + "step": 1347, + "total_loss": 0.005859375 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019850496449175508, + "lm_loss": 0.0031890869140625, + "loss": 0.0068, + "step": 1348, + "total_loss": 0.0031890869140625 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019850275106933157, + "lm_loss": 0.0057373046875, + "loss": 0.011, + "step": 1349, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019850053602197605, + "lm_loss": 0.00726318359375, + "loss": 0.0078, + "step": 1350, + "total_loss": 0.00726318359375 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019849831934972502, + "lm_loss": 0.007080078125, + "loss": 0.0105, + "step": 1351, + "total_loss": 0.007080078125 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019849610105261508, + "lm_loss": 0.0038604736328125, + "loss": 0.0089, + "step": 1352, + "total_loss": 0.0038604736328125 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019849388113068282, + "lm_loss": 0.003997802734375, + "loss": 0.0096, + "step": 1353, + "total_loss": 0.003997802734375 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019849165958396483, + "lm_loss": 0.01007080078125, + "loss": 0.0096, + "step": 1354, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001984894364124978, + "lm_loss": 0.008056640625, + "loss": 0.0096, + "step": 1355, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019848721161631837, + "lm_loss": 0.005401611328125, + "loss": 0.01, + "step": 1356, + "total_loss": 0.005401611328125 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019848498519546323, + "lm_loss": 0.0037994384765625, + "loss": 0.008, + "step": 1357, + "total_loss": 0.0037994384765625 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019848275714996915, + "lm_loss": 0.00823974609375, + "loss": 0.0106, + "step": 1358, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001984805274798729, + "lm_loss": 0.00933837890625, + "loss": 0.0088, + "step": 1359, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019847829618521118, + "lm_loss": 0.004974365234375, + "loss": 0.0098, + "step": 1360, + "total_loss": 0.004974365234375 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019847606326602087, + "lm_loss": 0.00970458984375, + "loss": 0.0091, + "step": 1361, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019847382872233875, + "lm_loss": 0.005828857421875, + "loss": 0.0068, + "step": 1362, + "total_loss": 0.005828857421875 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019847159255420173, + "lm_loss": 0.0029144287109375, + "loss": 0.0099, + "step": 1363, + "total_loss": 0.0029144287109375 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001984693547616467, + "lm_loss": 0.00762939453125, + "loss": 0.0072, + "step": 1364, + "total_loss": 0.00762939453125 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019846711534471053, + "lm_loss": 0.0147705078125, + "loss": 0.0106, + "step": 1365, + "total_loss": 0.0147705078125 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019846487430343022, + "lm_loss": 0.0057373046875, + "loss": 0.0097, + "step": 1366, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019846263163784267, + "lm_loss": 0.004638671875, + "loss": 0.0077, + "step": 1367, + "total_loss": 0.004638671875 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019846038734798493, + "lm_loss": 0.0029449462890625, + "loss": 0.0078, + "step": 1368, + "total_loss": 0.0029449462890625 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019845814143389402, + "lm_loss": 0.013427734375, + "loss": 0.0068, + "step": 1369, + "total_loss": 0.013427734375 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019845589389560695, + "lm_loss": 0.01519775390625, + "loss": 0.0112, + "step": 1370, + "total_loss": 0.01519775390625 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001984536447331608, + "lm_loss": 0.005462646484375, + "loss": 0.0093, + "step": 1371, + "total_loss": 0.005462646484375 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019845139394659276, + "lm_loss": 0.007049560546875, + "loss": 0.0099, + "step": 1372, + "total_loss": 0.007049560546875 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019844914153593983, + "lm_loss": 0.017822265625, + "loss": 0.011, + "step": 1373, + "total_loss": 0.017822265625 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019844688750123926, + "lm_loss": 0.00885009765625, + "loss": 0.008, + "step": 1374, + "total_loss": 0.00885009765625 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001984446318425282, + "lm_loss": 0.005889892578125, + "loss": 0.0108, + "step": 1375, + "total_loss": 0.005889892578125 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019844237455984387, + "lm_loss": 0.0086669921875, + "loss": 0.0087, + "step": 1376, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001984401156532235, + "lm_loss": 0.01531982421875, + "loss": 0.0099, + "step": 1377, + "total_loss": 0.01531982421875 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019843785512270432, + "lm_loss": 0.0059814453125, + "loss": 0.0073, + "step": 1378, + "total_loss": 0.0059814453125 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019843559296832367, + "lm_loss": 0.01141357421875, + "loss": 0.007, + "step": 1379, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019843332919011888, + "lm_loss": 0.0052490234375, + "loss": 0.0065, + "step": 1380, + "total_loss": 0.0052490234375 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019843106378812723, + "lm_loss": 0.006988525390625, + "loss": 0.0071, + "step": 1381, + "total_loss": 0.006988525390625 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019842879676238614, + "lm_loss": 0.0084228515625, + "loss": 0.0074, + "step": 1382, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.57, + "learning_rate": 0.000198426528112933, + "lm_loss": 0.00787353515625, + "loss": 0.0097, + "step": 1383, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019842425783980522, + "lm_loss": 0.007659912109375, + "loss": 0.0108, + "step": 1384, + "total_loss": 0.007659912109375 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019842198594304024, + "lm_loss": 0.00872802734375, + "loss": 0.0081, + "step": 1385, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001984197124226756, + "lm_loss": 0.0054931640625, + "loss": 0.0085, + "step": 1386, + "total_loss": 0.0054931640625 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019841743727874871, + "lm_loss": 0.0076904296875, + "loss": 0.0094, + "step": 1387, + "total_loss": 0.0076904296875 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001984151605112972, + "lm_loss": 0.0072021484375, + "loss": 0.0096, + "step": 1388, + "total_loss": 0.0072021484375 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019841288212035853, + "lm_loss": 0.0048828125, + "loss": 0.0078, + "step": 1389, + "total_loss": 0.0048828125 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001984106021059704, + "lm_loss": 0.007598876953125, + "loss": 0.0087, + "step": 1390, + "total_loss": 0.007598876953125 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001984083204681703, + "lm_loss": 0.01239013671875, + "loss": 0.0092, + "step": 1391, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019840603720699596, + "lm_loss": 0.004730224609375, + "loss": 0.0111, + "step": 1392, + "total_loss": 0.004730224609375 + }, + { + "epoch": 0.57, + "learning_rate": 0.000198403752322485, + "lm_loss": 0.0050048828125, + "loss": 0.0073, + "step": 1393, + "total_loss": 0.0050048828125 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019840146581467512, + "lm_loss": 0.0098876953125, + "loss": 0.0082, + "step": 1394, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019839917768360406, + "lm_loss": 0.005615234375, + "loss": 0.0089, + "step": 1395, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019839688792930952, + "lm_loss": 0.008544921875, + "loss": 0.0072, + "step": 1396, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019839459655182932, + "lm_loss": 0.00775146484375, + "loss": 0.0109, + "step": 1397, + "total_loss": 0.00775146484375 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019839230355120125, + "lm_loss": 0.0037841796875, + "loss": 0.0084, + "step": 1398, + "total_loss": 0.0037841796875 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001983900089274631, + "lm_loss": 0.01055908203125, + "loss": 0.0099, + "step": 1399, + "total_loss": 0.01055908203125 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019838771268065278, + "lm_loss": 0.00830078125, + "loss": 0.0089, + "step": 1400, + "total_loss": 0.00830078125 + }, + { + "epoch": 0.57, + "eval_lm_loss": 0.010626722127199173, + "eval_loss": 0.010994529351592064, + "eval_runtime": 43.9509, + "eval_samples_per_second": 22.753, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.010626722127199173, + "lm_loss": 0.00112152099609375, + "step": 1400, + "total_loss": 0.00112152099609375 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019838541481080813, + "lm_loss": 0.00958251953125, + "loss": 0.0083, + "step": 1401, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019838311531796706, + "lm_loss": 0.00885009765625, + "loss": 0.0094, + "step": 1402, + "total_loss": 0.00885009765625 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019838081420216753, + "lm_loss": 0.00933837890625, + "loss": 0.0073, + "step": 1403, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019837851146344744, + "lm_loss": 0.01336669921875, + "loss": 0.0091, + "step": 1404, + "total_loss": 0.01336669921875 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019837620710184481, + "lm_loss": 0.0052490234375, + "loss": 0.0085, + "step": 1405, + "total_loss": 0.0052490234375 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001983739011173977, + "lm_loss": 0.0135498046875, + "loss": 0.0084, + "step": 1406, + "total_loss": 0.0135498046875 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019837159351014408, + "lm_loss": 0.00848388671875, + "loss": 0.0084, + "step": 1407, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019836928428012204, + "lm_loss": 0.0125732421875, + "loss": 0.0072, + "step": 1408, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019836697342736972, + "lm_loss": 0.0087890625, + "loss": 0.0092, + "step": 1409, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001983646609519252, + "lm_loss": 0.0078125, + "loss": 0.0076, + "step": 1410, + "total_loss": 0.0078125 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019836234685382658, + "lm_loss": 0.0079345703125, + "loss": 0.0079, + "step": 1411, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001983600311331121, + "lm_loss": 0.00811767578125, + "loss": 0.0082, + "step": 1412, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019835771378981995, + "lm_loss": 0.0050048828125, + "loss": 0.0077, + "step": 1413, + "total_loss": 0.0050048828125 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019835539482398836, + "lm_loss": 0.01116943359375, + "loss": 0.0084, + "step": 1414, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019835307423565554, + "lm_loss": 0.01177978515625, + "loss": 0.0086, + "step": 1415, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019835075202485983, + "lm_loss": 0.004638671875, + "loss": 0.0092, + "step": 1416, + "total_loss": 0.004638671875 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001983484281916395, + "lm_loss": 0.00360107421875, + "loss": 0.0109, + "step": 1417, + "total_loss": 0.00360107421875 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019834610273603294, + "lm_loss": 0.003509521484375, + "loss": 0.009, + "step": 1418, + "total_loss": 0.003509521484375 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019834377565807842, + "lm_loss": 0.005035400390625, + "loss": 0.0101, + "step": 1419, + "total_loss": 0.005035400390625 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001983414469578144, + "lm_loss": 0.00921630859375, + "loss": 0.0097, + "step": 1420, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019833911663527927, + "lm_loss": 0.005615234375, + "loss": 0.0082, + "step": 1421, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019833678469051145, + "lm_loss": 0.0137939453125, + "loss": 0.0104, + "step": 1422, + "total_loss": 0.0137939453125 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019833445112354948, + "lm_loss": 0.006500244140625, + "loss": 0.0106, + "step": 1423, + "total_loss": 0.006500244140625 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001983321159344318, + "lm_loss": 0.00677490234375, + "loss": 0.0084, + "step": 1424, + "total_loss": 0.00677490234375 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001983297791231969, + "lm_loss": 0.00567626953125, + "loss": 0.0093, + "step": 1425, + "total_loss": 0.00567626953125 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019832744068988342, + "lm_loss": 0.0096435546875, + "loss": 0.0093, + "step": 1426, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019832510063452983, + "lm_loss": 0.010498046875, + "loss": 0.0091, + "step": 1427, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019832275895717482, + "lm_loss": 0.009033203125, + "loss": 0.0078, + "step": 1428, + "total_loss": 0.009033203125 + }, + { + "epoch": 0.58, + "learning_rate": 0.000198320415657857, + "lm_loss": 0.0120849609375, + "loss": 0.0099, + "step": 1429, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019831807073661498, + "lm_loss": 0.00156402587890625, + "loss": 0.0075, + "step": 1430, + "total_loss": 0.00156402587890625 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019831572419348747, + "lm_loss": 0.010498046875, + "loss": 0.0076, + "step": 1431, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019831337602851324, + "lm_loss": 0.01495361328125, + "loss": 0.0089, + "step": 1432, + "total_loss": 0.01495361328125 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019831102624173094, + "lm_loss": 0.00994873046875, + "loss": 0.0099, + "step": 1433, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001983086748331793, + "lm_loss": 0.005950927734375, + "loss": 0.008, + "step": 1434, + "total_loss": 0.005950927734375 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019830632180289725, + "lm_loss": 0.0078125, + "loss": 0.0099, + "step": 1435, + "total_loss": 0.0078125 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001983039671509235, + "lm_loss": 0.006561279296875, + "loss": 0.0101, + "step": 1436, + "total_loss": 0.006561279296875 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019830161087729692, + "lm_loss": 0.00775146484375, + "loss": 0.0095, + "step": 1437, + "total_loss": 0.00775146484375 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019829925298205637, + "lm_loss": 0.01007080078125, + "loss": 0.01, + "step": 1438, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001982968934652408, + "lm_loss": 0.007293701171875, + "loss": 0.0082, + "step": 1439, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019829453232688907, + "lm_loss": 0.006378173828125, + "loss": 0.0086, + "step": 1440, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019829216956704013, + "lm_loss": 0.012939453125, + "loss": 0.0085, + "step": 1441, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.59, + "learning_rate": 0.000198289805185733, + "lm_loss": 0.0130615234375, + "loss": 0.0096, + "step": 1442, + "total_loss": 0.0130615234375 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019828743918300665, + "lm_loss": 0.006103515625, + "loss": 0.0082, + "step": 1443, + "total_loss": 0.006103515625 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019828507155890014, + "lm_loss": 0.00714111328125, + "loss": 0.0081, + "step": 1444, + "total_loss": 0.00714111328125 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001982827023134525, + "lm_loss": 0.00494384765625, + "loss": 0.0085, + "step": 1445, + "total_loss": 0.00494384765625 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019828033144670283, + "lm_loss": 0.0086669921875, + "loss": 0.0098, + "step": 1446, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001982779589586902, + "lm_loss": 0.0067138671875, + "loss": 0.0092, + "step": 1447, + "total_loss": 0.0067138671875 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001982755848494538, + "lm_loss": 0.01214599609375, + "loss": 0.0099, + "step": 1448, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019827320911903278, + "lm_loss": 0.004425048828125, + "loss": 0.0087, + "step": 1449, + "total_loss": 0.004425048828125 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019827083176746633, + "lm_loss": 0.01031494140625, + "loss": 0.0105, + "step": 1450, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019826845279479364, + "lm_loss": 0.01177978515625, + "loss": 0.0091, + "step": 1451, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.59, + "learning_rate": 0.000198266072201054, + "lm_loss": 0.01080322265625, + "loss": 0.0083, + "step": 1452, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019826368998628665, + "lm_loss": 0.002685546875, + "loss": 0.01, + "step": 1453, + "total_loss": 0.002685546875 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001982613061505309, + "lm_loss": 0.008544921875, + "loss": 0.0092, + "step": 1454, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019825892069382607, + "lm_loss": 0.01116943359375, + "loss": 0.0092, + "step": 1455, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019825653361621147, + "lm_loss": 0.0157470703125, + "loss": 0.0091, + "step": 1456, + "total_loss": 0.0157470703125 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019825414491772658, + "lm_loss": 0.00469970703125, + "loss": 0.0083, + "step": 1457, + "total_loss": 0.00469970703125 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019825175459841073, + "lm_loss": 0.01300048828125, + "loss": 0.0083, + "step": 1458, + "total_loss": 0.01300048828125 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019824936265830335, + "lm_loss": 0.0032806396484375, + "loss": 0.0074, + "step": 1459, + "total_loss": 0.0032806396484375 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001982469690974439, + "lm_loss": 0.01019287109375, + "loss": 0.0082, + "step": 1460, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019824457391587192, + "lm_loss": 0.01123046875, + "loss": 0.0098, + "step": 1461, + "total_loss": 0.01123046875 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019824217711362688, + "lm_loss": 0.01214599609375, + "loss": 0.007, + "step": 1462, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001982397786907483, + "lm_loss": 0.0030670166015625, + "loss": 0.0069, + "step": 1463, + "total_loss": 0.0030670166015625 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019823737864727576, + "lm_loss": 0.007415771484375, + "loss": 0.0087, + "step": 1464, + "total_loss": 0.007415771484375 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019823497698324887, + "lm_loss": 0.005096435546875, + "loss": 0.0099, + "step": 1465, + "total_loss": 0.005096435546875 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019823257369870723, + "lm_loss": 0.00830078125, + "loss": 0.0092, + "step": 1466, + "total_loss": 0.00830078125 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019823016879369048, + "lm_loss": 0.005615234375, + "loss": 0.0098, + "step": 1467, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001982277622682383, + "lm_loss": 0.005401611328125, + "loss": 0.0074, + "step": 1468, + "total_loss": 0.005401611328125 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001982253541223904, + "lm_loss": 0.0146484375, + "loss": 0.0093, + "step": 1469, + "total_loss": 0.0146484375 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001982229443561865, + "lm_loss": 0.0059814453125, + "loss": 0.0095, + "step": 1470, + "total_loss": 0.0059814453125 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019822053296966635, + "lm_loss": 0.007080078125, + "loss": 0.0093, + "step": 1471, + "total_loss": 0.007080078125 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001982181199628697, + "lm_loss": 0.0024871826171875, + "loss": 0.0088, + "step": 1472, + "total_loss": 0.0024871826171875 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001982157053358364, + "lm_loss": 0.012451171875, + "loss": 0.0107, + "step": 1473, + "total_loss": 0.012451171875 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019821328908860628, + "lm_loss": 0.0067138671875, + "loss": 0.0073, + "step": 1474, + "total_loss": 0.0067138671875 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019821087122121918, + "lm_loss": 0.010498046875, + "loss": 0.0088, + "step": 1475, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019820845173371498, + "lm_loss": 0.0091552734375, + "loss": 0.0072, + "step": 1476, + "total_loss": 0.0091552734375 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001982060306261336, + "lm_loss": 0.006744384765625, + "loss": 0.0082, + "step": 1477, + "total_loss": 0.006744384765625 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019820360789851499, + "lm_loss": 0.01239013671875, + "loss": 0.008, + "step": 1478, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019820118355089915, + "lm_loss": 0.01080322265625, + "loss": 0.0079, + "step": 1479, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019819875758332595, + "lm_loss": 0.0079345703125, + "loss": 0.01, + "step": 1480, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019819632999583555, + "lm_loss": 0.01116943359375, + "loss": 0.0094, + "step": 1481, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019819390078846792, + "lm_loss": 0.00823974609375, + "loss": 0.0098, + "step": 1482, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019819146996126317, + "lm_loss": 0.01165771484375, + "loss": 0.0087, + "step": 1483, + "total_loss": 0.01165771484375 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019818903751426135, + "lm_loss": 0.0076904296875, + "loss": 0.0081, + "step": 1484, + "total_loss": 0.0076904296875 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019818660344750262, + "lm_loss": 0.00170135498046875, + "loss": 0.0098, + "step": 1485, + "total_loss": 0.00170135498046875 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019818416776102714, + "lm_loss": 0.00982666015625, + "loss": 0.009, + "step": 1486, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019818173045487507, + "lm_loss": 0.00445556640625, + "loss": 0.0079, + "step": 1487, + "total_loss": 0.00445556640625 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019817929152908664, + "lm_loss": 0.00506591796875, + "loss": 0.0081, + "step": 1488, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019817685098370204, + "lm_loss": 0.01312255859375, + "loss": 0.0102, + "step": 1489, + "total_loss": 0.01312255859375 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001981744088187616, + "lm_loss": 0.00567626953125, + "loss": 0.0071, + "step": 1490, + "total_loss": 0.00567626953125 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019817196503430556, + "lm_loss": 0.021728515625, + "loss": 0.009, + "step": 1491, + "total_loss": 0.021728515625 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001981695196303742, + "lm_loss": 0.01519775390625, + "loss": 0.0075, + "step": 1492, + "total_loss": 0.01519775390625 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019816707260700793, + "lm_loss": 0.009765625, + "loss": 0.0087, + "step": 1493, + "total_loss": 0.009765625 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019816462396424707, + "lm_loss": 0.01068115234375, + "loss": 0.0111, + "step": 1494, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019816217370213207, + "lm_loss": 0.013427734375, + "loss": 0.0088, + "step": 1495, + "total_loss": 0.013427734375 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019815972182070328, + "lm_loss": 0.003814697265625, + "loss": 0.0099, + "step": 1496, + "total_loss": 0.003814697265625 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019815726832000117, + "lm_loss": 0.004669189453125, + "loss": 0.0102, + "step": 1497, + "total_loss": 0.004669189453125 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001981548132000662, + "lm_loss": 0.00897216796875, + "loss": 0.0091, + "step": 1498, + "total_loss": 0.00897216796875 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019815235646093894, + "lm_loss": 0.00482177734375, + "loss": 0.008, + "step": 1499, + "total_loss": 0.00482177734375 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019814989810265984, + "lm_loss": 0.00933837890625, + "loss": 0.0106, + "step": 1500, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.61, + "eval_lm_loss": 0.010354571975767612, + "eval_loss": 0.010629158467054367, + "eval_runtime": 43.9429, + "eval_samples_per_second": 22.757, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.010354571975767612, + "lm_loss": 0.00171661376953125, + "step": 1500, + "total_loss": 0.00171661376953125 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019814743812526952, + "lm_loss": 0.00811767578125, + "loss": 0.0099, + "step": 1501, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001981449765288085, + "lm_loss": 0.01080322265625, + "loss": 0.0095, + "step": 1502, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001981425133133174, + "lm_loss": 0.0101318359375, + "loss": 0.0084, + "step": 1503, + "total_loss": 0.0101318359375 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019814004847883685, + "lm_loss": 0.0084228515625, + "loss": 0.0069, + "step": 1504, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019813758202540756, + "lm_loss": 0.01385498046875, + "loss": 0.0105, + "step": 1505, + "total_loss": 0.01385498046875 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019813511395307015, + "lm_loss": 0.007537841796875, + "loss": 0.0091, + "step": 1506, + "total_loss": 0.007537841796875 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001981326442618654, + "lm_loss": 0.00799560546875, + "loss": 0.0084, + "step": 1507, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019813017295183398, + "lm_loss": 0.0103759765625, + "loss": 0.0094, + "step": 1508, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019812770002301672, + "lm_loss": 0.0089111328125, + "loss": 0.0089, + "step": 1509, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019812522547545437, + "lm_loss": 0.006927490234375, + "loss": 0.0078, + "step": 1510, + "total_loss": 0.006927490234375 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001981227493091878, + "lm_loss": 0.00433349609375, + "loss": 0.0077, + "step": 1511, + "total_loss": 0.00433349609375 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001981202715242578, + "lm_loss": 0.009765625, + "loss": 0.0095, + "step": 1512, + "total_loss": 0.009765625 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001981177921207053, + "lm_loss": 0.007293701171875, + "loss": 0.0086, + "step": 1513, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019811531109857112, + "lm_loss": 0.00439453125, + "loss": 0.0075, + "step": 1514, + "total_loss": 0.00439453125 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001981128284578963, + "lm_loss": 0.01129150390625, + "loss": 0.0073, + "step": 1515, + "total_loss": 0.01129150390625 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001981103441987217, + "lm_loss": 0.0169677734375, + "loss": 0.0083, + "step": 1516, + "total_loss": 0.0169677734375 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019810785832108832, + "lm_loss": 0.011474609375, + "loss": 0.0102, + "step": 1517, + "total_loss": 0.011474609375 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019810537082503725, + "lm_loss": 0.00457763671875, + "loss": 0.0084, + "step": 1518, + "total_loss": 0.00457763671875 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001981028817106094, + "lm_loss": 0.00665283203125, + "loss": 0.0085, + "step": 1519, + "total_loss": 0.00665283203125 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019810039097784593, + "lm_loss": 0.005859375, + "loss": 0.009, + "step": 1520, + "total_loss": 0.005859375 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019809789862678786, + "lm_loss": 0.006866455078125, + "loss": 0.007, + "step": 1521, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001980954046574763, + "lm_loss": 0.00787353515625, + "loss": 0.0091, + "step": 1522, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019809290906995245, + "lm_loss": 0.00909423828125, + "loss": 0.0086, + "step": 1523, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019809041186425748, + "lm_loss": 0.0062255859375, + "loss": 0.0078, + "step": 1524, + "total_loss": 0.0062255859375 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019808791304043251, + "lm_loss": 0.008544921875, + "loss": 0.0118, + "step": 1525, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019808541259851885, + "lm_loss": 0.010986328125, + "loss": 0.0083, + "step": 1526, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019808291053855767, + "lm_loss": 0.00799560546875, + "loss": 0.0095, + "step": 1527, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019808040686059033, + "lm_loss": 0.00738525390625, + "loss": 0.0083, + "step": 1528, + "total_loss": 0.00738525390625 + }, + { + "epoch": 0.63, + "learning_rate": 0.000198077901564658, + "lm_loss": 0.0042724609375, + "loss": 0.0089, + "step": 1529, + "total_loss": 0.0042724609375 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001980753946508022, + "lm_loss": 0.005706787109375, + "loss": 0.0079, + "step": 1530, + "total_loss": 0.005706787109375 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019807288611906407, + "lm_loss": 0.00823974609375, + "loss": 0.0099, + "step": 1531, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019807037596948518, + "lm_loss": 0.005340576171875, + "loss": 0.0089, + "step": 1532, + "total_loss": 0.005340576171875 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019806786420210678, + "lm_loss": 0.00982666015625, + "loss": 0.0093, + "step": 1533, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019806535081697043, + "lm_loss": 0.006866455078125, + "loss": 0.0071, + "step": 1534, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001980628358141175, + "lm_loss": 0.014892578125, + "loss": 0.0085, + "step": 1535, + "total_loss": 0.014892578125 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019806031919358957, + "lm_loss": 0.0079345703125, + "loss": 0.0074, + "step": 1536, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019805780095542806, + "lm_loss": 0.004241943359375, + "loss": 0.0064, + "step": 1537, + "total_loss": 0.004241943359375 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019805528109967458, + "lm_loss": 0.0019989013671875, + "loss": 0.0128, + "step": 1538, + "total_loss": 0.0019989013671875 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019805275962637064, + "lm_loss": 0.009765625, + "loss": 0.0094, + "step": 1539, + "total_loss": 0.009765625 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001980502365355579, + "lm_loss": 0.00958251953125, + "loss": 0.0075, + "step": 1540, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019804771182727792, + "lm_loss": 0.00775146484375, + "loss": 0.0084, + "step": 1541, + "total_loss": 0.00775146484375 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019804518550157243, + "lm_loss": 0.00927734375, + "loss": 0.0085, + "step": 1542, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.63, + "learning_rate": 0.000198042657558483, + "lm_loss": 0.00347900390625, + "loss": 0.0077, + "step": 1543, + "total_loss": 0.00347900390625 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019804012799805142, + "lm_loss": 0.0096435546875, + "loss": 0.0092, + "step": 1544, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001980375968203194, + "lm_loss": 0.006591796875, + "loss": 0.0086, + "step": 1545, + "total_loss": 0.006591796875 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019803506402532866, + "lm_loss": 0.0059814453125, + "loss": 0.0072, + "step": 1546, + "total_loss": 0.0059814453125 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019803252961312097, + "lm_loss": 0.00506591796875, + "loss": 0.009, + "step": 1547, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001980299935837382, + "lm_loss": 0.00946044921875, + "loss": 0.0084, + "step": 1548, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019802745593722217, + "lm_loss": 0.01141357421875, + "loss": 0.0081, + "step": 1549, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001980249166736147, + "lm_loss": 0.007476806640625, + "loss": 0.0075, + "step": 1550, + "total_loss": 0.007476806640625 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019802237579295771, + "lm_loss": 0.006866455078125, + "loss": 0.0087, + "step": 1551, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001980198332952931, + "lm_loss": 0.01025390625, + "loss": 0.0074, + "step": 1552, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019801728918066283, + "lm_loss": 0.005523681640625, + "loss": 0.007, + "step": 1553, + "total_loss": 0.005523681640625 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019801474344910885, + "lm_loss": 0.00811767578125, + "loss": 0.0089, + "step": 1554, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001980121961006732, + "lm_loss": 0.011474609375, + "loss": 0.007, + "step": 1555, + "total_loss": 0.011474609375 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019800964713539783, + "lm_loss": 0.007415771484375, + "loss": 0.008, + "step": 1556, + "total_loss": 0.007415771484375 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019800709655332487, + "lm_loss": 0.010498046875, + "loss": 0.0079, + "step": 1557, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019800454435449632, + "lm_loss": 0.010498046875, + "loss": 0.0094, + "step": 1558, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001980019905389543, + "lm_loss": 0.007354736328125, + "loss": 0.0084, + "step": 1559, + "total_loss": 0.007354736328125 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019799943510674097, + "lm_loss": 0.00567626953125, + "loss": 0.0078, + "step": 1560, + "total_loss": 0.00567626953125 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019799687805789845, + "lm_loss": 0.0107421875, + "loss": 0.0113, + "step": 1561, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019799431939246896, + "lm_loss": 0.0087890625, + "loss": 0.0083, + "step": 1562, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001979917591104947, + "lm_loss": 0.0030059814453125, + "loss": 0.01, + "step": 1563, + "total_loss": 0.0030059814453125 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019798919721201785, + "lm_loss": 0.005859375, + "loss": 0.011, + "step": 1564, + "total_loss": 0.005859375 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019798663369708074, + "lm_loss": 0.004791259765625, + "loss": 0.008, + "step": 1565, + "total_loss": 0.004791259765625 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019798406856572561, + "lm_loss": 0.004791259765625, + "loss": 0.0088, + "step": 1566, + "total_loss": 0.004791259765625 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019798150181799483, + "lm_loss": 0.01025390625, + "loss": 0.0074, + "step": 1567, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019797893345393074, + "lm_loss": 0.00173187255859375, + "loss": 0.0086, + "step": 1568, + "total_loss": 0.00173187255859375 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001979763634735756, + "lm_loss": 0.017333984375, + "loss": 0.0077, + "step": 1569, + "total_loss": 0.017333984375 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019797379187697192, + "lm_loss": 0.012939453125, + "loss": 0.0112, + "step": 1570, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001979712186641621, + "lm_loss": 0.01806640625, + "loss": 0.0104, + "step": 1571, + "total_loss": 0.01806640625 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019796864383518856, + "lm_loss": 0.0098876953125, + "loss": 0.0103, + "step": 1572, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019796606739009377, + "lm_loss": 0.0081787109375, + "loss": 0.0075, + "step": 1573, + "total_loss": 0.0081787109375 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019796348932892027, + "lm_loss": 0.009765625, + "loss": 0.0069, + "step": 1574, + "total_loss": 0.009765625 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019796090965171057, + "lm_loss": 0.006317138671875, + "loss": 0.0091, + "step": 1575, + "total_loss": 0.006317138671875 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019795832835850722, + "lm_loss": 0.00335693359375, + "loss": 0.0074, + "step": 1576, + "total_loss": 0.00335693359375 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001979557454493528, + "lm_loss": 0.007080078125, + "loss": 0.0081, + "step": 1577, + "total_loss": 0.007080078125 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019795316092428996, + "lm_loss": 0.0040283203125, + "loss": 0.0091, + "step": 1578, + "total_loss": 0.0040283203125 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019795057478336125, + "lm_loss": 0.00872802734375, + "loss": 0.0086, + "step": 1579, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019794798702660942, + "lm_loss": 0.00799560546875, + "loss": 0.0073, + "step": 1580, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019794539765407708, + "lm_loss": 0.0079345703125, + "loss": 0.0075, + "step": 1581, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.65, + "learning_rate": 0.000197942806665807, + "lm_loss": 0.00848388671875, + "loss": 0.0094, + "step": 1582, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019794021406184195, + "lm_loss": 0.006622314453125, + "loss": 0.0082, + "step": 1583, + "total_loss": 0.006622314453125 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001979376198422246, + "lm_loss": 0.00173187255859375, + "loss": 0.0084, + "step": 1584, + "total_loss": 0.00173187255859375 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019793502400699783, + "lm_loss": 0.00848388671875, + "loss": 0.0082, + "step": 1585, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019793242655620445, + "lm_loss": 0.00921630859375, + "loss": 0.0095, + "step": 1586, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019792982748988728, + "lm_loss": 0.013671875, + "loss": 0.0105, + "step": 1587, + "total_loss": 0.013671875 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019792722680808916, + "lm_loss": 0.00799560546875, + "loss": 0.0072, + "step": 1588, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019792462451085306, + "lm_loss": 0.0125732421875, + "loss": 0.0096, + "step": 1589, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019792202059822192, + "lm_loss": 0.00396728515625, + "loss": 0.008, + "step": 1590, + "total_loss": 0.00396728515625 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019791941507023864, + "lm_loss": 0.005126953125, + "loss": 0.009, + "step": 1591, + "total_loss": 0.005126953125 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019791680792694623, + "lm_loss": 0.01141357421875, + "loss": 0.0091, + "step": 1592, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019791419916838766, + "lm_loss": 0.007568359375, + "loss": 0.0091, + "step": 1593, + "total_loss": 0.007568359375 + }, + { + "epoch": 0.65, + "learning_rate": 0.000197911588794606, + "lm_loss": 0.0081787109375, + "loss": 0.0098, + "step": 1594, + "total_loss": 0.0081787109375 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019790897680564432, + "lm_loss": 0.00909423828125, + "loss": 0.0095, + "step": 1595, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001979063632015457, + "lm_loss": 0.0159912109375, + "loss": 0.0085, + "step": 1596, + "total_loss": 0.0159912109375 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019790374798235325, + "lm_loss": 0.00872802734375, + "loss": 0.0085, + "step": 1597, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019790113114811011, + "lm_loss": 0.01031494140625, + "loss": 0.0081, + "step": 1598, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019789851269885945, + "lm_loss": 0.00970458984375, + "loss": 0.0068, + "step": 1599, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019789589263464447, + "lm_loss": 0.0107421875, + "loss": 0.0091, + "step": 1600, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.65, + "eval_lm_loss": 0.00995130930095911, + "eval_loss": 0.010327949188649654, + "eval_runtime": 43.9584, + "eval_samples_per_second": 22.749, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.00995130930095911, + "lm_loss": 0.00160980224609375, + "step": 1600, + "total_loss": 0.00160980224609375 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019789327095550837, + "lm_loss": 0.004638671875, + "loss": 0.0076, + "step": 1601, + "total_loss": 0.004638671875 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001978906476614944, + "lm_loss": 0.0111083984375, + "loss": 0.0077, + "step": 1602, + "total_loss": 0.0111083984375 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001978880227526459, + "lm_loss": 0.0107421875, + "loss": 0.0097, + "step": 1603, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019788539622900606, + "lm_loss": 0.022216796875, + "loss": 0.0093, + "step": 1604, + "total_loss": 0.022216796875 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001978827680906183, + "lm_loss": 0.01239013671875, + "loss": 0.009, + "step": 1605, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019788013833752595, + "lm_loss": 0.01068115234375, + "loss": 0.0072, + "step": 1606, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019787750696977234, + "lm_loss": 0.0079345703125, + "loss": 0.0079, + "step": 1607, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019787487398740096, + "lm_loss": 0.00506591796875, + "loss": 0.0083, + "step": 1608, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019787223939045522, + "lm_loss": 0.0023040771484375, + "loss": 0.0089, + "step": 1609, + "total_loss": 0.0023040771484375 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019786960317897854, + "lm_loss": 0.00872802734375, + "loss": 0.0119, + "step": 1610, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019786696535301442, + "lm_loss": 0.006622314453125, + "loss": 0.0119, + "step": 1611, + "total_loss": 0.006622314453125 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001978643259126064, + "lm_loss": 0.00469970703125, + "loss": 0.0069, + "step": 1612, + "total_loss": 0.00469970703125 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019786168485779802, + "lm_loss": 0.01007080078125, + "loss": 0.0073, + "step": 1613, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001978590421886328, + "lm_loss": 0.00750732421875, + "loss": 0.0103, + "step": 1614, + "total_loss": 0.00750732421875 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019785639790515443, + "lm_loss": 0.01385498046875, + "loss": 0.0097, + "step": 1615, + "total_loss": 0.01385498046875 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019785375200740646, + "lm_loss": 0.01019287109375, + "loss": 0.0095, + "step": 1616, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019785110449543254, + "lm_loss": 0.004852294921875, + "loss": 0.0093, + "step": 1617, + "total_loss": 0.004852294921875 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019784845536927636, + "lm_loss": 0.00634765625, + "loss": 0.0092, + "step": 1618, + "total_loss": 0.00634765625 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001978458046289816, + "lm_loss": 0.009521484375, + "loss": 0.0073, + "step": 1619, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.66, + "learning_rate": 0.000197843152274592, + "lm_loss": 0.01251220703125, + "loss": 0.0073, + "step": 1620, + "total_loss": 0.01251220703125 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019784049830615135, + "lm_loss": 0.01312255859375, + "loss": 0.0093, + "step": 1621, + "total_loss": 0.01312255859375 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019783784272370337, + "lm_loss": 0.005767822265625, + "loss": 0.008, + "step": 1622, + "total_loss": 0.005767822265625 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019783518552729193, + "lm_loss": 0.006256103515625, + "loss": 0.0086, + "step": 1623, + "total_loss": 0.006256103515625 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019783252671696076, + "lm_loss": 0.01214599609375, + "loss": 0.01, + "step": 1624, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019782986629275385, + "lm_loss": 0.0087890625, + "loss": 0.0102, + "step": 1625, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.66, + "learning_rate": 0.000197827204254715, + "lm_loss": 0.00469970703125, + "loss": 0.0083, + "step": 1626, + "total_loss": 0.00469970703125 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019782454060288815, + "lm_loss": 0.00933837890625, + "loss": 0.0063, + "step": 1627, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019782187533731727, + "lm_loss": 0.007720947265625, + "loss": 0.008, + "step": 1628, + "total_loss": 0.007720947265625 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019781920845804627, + "lm_loss": 0.0068359375, + "loss": 0.0107, + "step": 1629, + "total_loss": 0.0068359375 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019781653996511919, + "lm_loss": 0.003448486328125, + "loss": 0.0088, + "step": 1630, + "total_loss": 0.003448486328125 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019781386985858, + "lm_loss": 0.0108642578125, + "loss": 0.0079, + "step": 1631, + "total_loss": 0.0108642578125 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001978111981384728, + "lm_loss": 0.01220703125, + "loss": 0.0098, + "step": 1632, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019780852480484165, + "lm_loss": 0.00555419921875, + "loss": 0.0066, + "step": 1633, + "total_loss": 0.00555419921875 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019780584985773064, + "lm_loss": 0.004150390625, + "loss": 0.0098, + "step": 1634, + "total_loss": 0.004150390625 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019780317329718387, + "lm_loss": 0.005126953125, + "loss": 0.0082, + "step": 1635, + "total_loss": 0.005126953125 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019780049512324555, + "lm_loss": 0.007568359375, + "loss": 0.0075, + "step": 1636, + "total_loss": 0.007568359375 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019779781533595982, + "lm_loss": 0.01055908203125, + "loss": 0.0082, + "step": 1637, + "total_loss": 0.01055908203125 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019779513393537092, + "lm_loss": 0.00823974609375, + "loss": 0.0072, + "step": 1638, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019779245092152304, + "lm_loss": 0.00457763671875, + "loss": 0.0101, + "step": 1639, + "total_loss": 0.00457763671875 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019778976629446048, + "lm_loss": 0.0042724609375, + "loss": 0.0079, + "step": 1640, + "total_loss": 0.0042724609375 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001977870800542275, + "lm_loss": 0.00799560546875, + "loss": 0.0089, + "step": 1641, + "total_loss": 0.00799560546875 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001977843922008684, + "lm_loss": 0.007354736328125, + "loss": 0.0087, + "step": 1642, + "total_loss": 0.007354736328125 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019778170273442757, + "lm_loss": 0.01025390625, + "loss": 0.0075, + "step": 1643, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019777901165494934, + "lm_loss": 0.01177978515625, + "loss": 0.0091, + "step": 1644, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019777631896247813, + "lm_loss": 0.00174713134765625, + "loss": 0.0072, + "step": 1645, + "total_loss": 0.00174713134765625 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001977736246570583, + "lm_loss": 0.004974365234375, + "loss": 0.0094, + "step": 1646, + "total_loss": 0.004974365234375 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019777092873873438, + "lm_loss": 0.0093994140625, + "loss": 0.0106, + "step": 1647, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019776823120755077, + "lm_loss": 0.00787353515625, + "loss": 0.0081, + "step": 1648, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.67, + "learning_rate": 0.000197765532063552, + "lm_loss": 0.0022125244140625, + "loss": 0.0087, + "step": 1649, + "total_loss": 0.0022125244140625 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019776283130678262, + "lm_loss": 0.01031494140625, + "loss": 0.0085, + "step": 1650, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.67, + "learning_rate": 0.0001977601289372871, + "lm_loss": 0.0093994140625, + "loss": 0.0071, + "step": 1651, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019775742495511015, + "lm_loss": 0.01422119140625, + "loss": 0.0086, + "step": 1652, + "total_loss": 0.01422119140625 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019775471936029623, + "lm_loss": 0.001617431640625, + "loss": 0.0077, + "step": 1653, + "total_loss": 0.001617431640625 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019775201215289006, + "lm_loss": 0.0089111328125, + "loss": 0.009, + "step": 1654, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019774930333293632, + "lm_loss": 0.0045166015625, + "loss": 0.0073, + "step": 1655, + "total_loss": 0.0045166015625 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019774659290047963, + "lm_loss": 0.005889892578125, + "loss": 0.0081, + "step": 1656, + "total_loss": 0.005889892578125 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019774388085556474, + "lm_loss": 0.00921630859375, + "loss": 0.0097, + "step": 1657, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019774116719823638, + "lm_loss": 0.00537109375, + "loss": 0.0082, + "step": 1658, + "total_loss": 0.00537109375 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019773845192853933, + "lm_loss": 0.013427734375, + "loss": 0.0096, + "step": 1659, + "total_loss": 0.013427734375 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019773573504651832, + "lm_loss": 0.005706787109375, + "loss": 0.0096, + "step": 1660, + "total_loss": 0.005706787109375 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019773301655221824, + "lm_loss": 0.0147705078125, + "loss": 0.0098, + "step": 1661, + "total_loss": 0.0147705078125 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019773029644568394, + "lm_loss": 0.0169677734375, + "loss": 0.0112, + "step": 1662, + "total_loss": 0.0169677734375 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019772757472696022, + "lm_loss": 0.010009765625, + "loss": 0.0088, + "step": 1663, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019772485139609203, + "lm_loss": 0.00689697265625, + "loss": 0.009, + "step": 1664, + "total_loss": 0.00689697265625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001977221264531243, + "lm_loss": 0.0081787109375, + "loss": 0.0081, + "step": 1665, + "total_loss": 0.0081787109375 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019771939989810195, + "lm_loss": 0.00579833984375, + "loss": 0.0078, + "step": 1666, + "total_loss": 0.00579833984375 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019771667173106998, + "lm_loss": 0.00787353515625, + "loss": 0.0066, + "step": 1667, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001977139419520734, + "lm_loss": 0.01123046875, + "loss": 0.0078, + "step": 1668, + "total_loss": 0.01123046875 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001977112105611572, + "lm_loss": 0.00634765625, + "loss": 0.0072, + "step": 1669, + "total_loss": 0.00634765625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001977084775583665, + "lm_loss": 0.010009765625, + "loss": 0.0083, + "step": 1670, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019770574294374632, + "lm_loss": 0.00531005859375, + "loss": 0.0088, + "step": 1671, + "total_loss": 0.00531005859375 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019770300671734185, + "lm_loss": 0.0211181640625, + "loss": 0.0086, + "step": 1672, + "total_loss": 0.0211181640625 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019770026887919816, + "lm_loss": 0.0216064453125, + "loss": 0.0118, + "step": 1673, + "total_loss": 0.0216064453125 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019769752942936042, + "lm_loss": 0.0115966796875, + "loss": 0.0113, + "step": 1674, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019769478836787385, + "lm_loss": 0.01043701171875, + "loss": 0.0084, + "step": 1675, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019769204569478367, + "lm_loss": 0.0052490234375, + "loss": 0.0096, + "step": 1676, + "total_loss": 0.0052490234375 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019768930141013507, + "lm_loss": 0.0126953125, + "loss": 0.0099, + "step": 1677, + "total_loss": 0.0126953125 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001976865555139734, + "lm_loss": 0.0068359375, + "loss": 0.0076, + "step": 1678, + "total_loss": 0.0068359375 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001976838080063439, + "lm_loss": 0.007598876953125, + "loss": 0.0098, + "step": 1679, + "total_loss": 0.007598876953125 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001976810588872919, + "lm_loss": 0.005950927734375, + "loss": 0.0085, + "step": 1680, + "total_loss": 0.005950927734375 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019767830815686278, + "lm_loss": 0.0086669921875, + "loss": 0.0065, + "step": 1681, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019767555581510186, + "lm_loss": 0.01416015625, + "loss": 0.0109, + "step": 1682, + "total_loss": 0.01416015625 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019767280186205462, + "lm_loss": 0.0137939453125, + "loss": 0.0088, + "step": 1683, + "total_loss": 0.0137939453125 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001976700462977664, + "lm_loss": 0.00787353515625, + "loss": 0.0088, + "step": 1684, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019766728912228273, + "lm_loss": 0.0103759765625, + "loss": 0.0072, + "step": 1685, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001976645303356491, + "lm_loss": 0.00140380859375, + "loss": 0.0112, + "step": 1686, + "total_loss": 0.00140380859375 + }, + { + "epoch": 0.69, + "learning_rate": 0.000197661769937911, + "lm_loss": 0.00848388671875, + "loss": 0.0067, + "step": 1687, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001976590079291139, + "lm_loss": 0.00482177734375, + "loss": 0.0087, + "step": 1688, + "total_loss": 0.00482177734375 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019765624430930344, + "lm_loss": 0.004058837890625, + "loss": 0.0075, + "step": 1689, + "total_loss": 0.004058837890625 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001976534790785252, + "lm_loss": 0.00677490234375, + "loss": 0.0082, + "step": 1690, + "total_loss": 0.00677490234375 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019765071223682478, + "lm_loss": 0.005950927734375, + "loss": 0.0069, + "step": 1691, + "total_loss": 0.005950927734375 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019764794378424782, + "lm_loss": 0.00994873046875, + "loss": 0.0087, + "step": 1692, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019764517372084002, + "lm_loss": 0.00872802734375, + "loss": 0.0068, + "step": 1693, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019764240204664702, + "lm_loss": 0.00970458984375, + "loss": 0.0086, + "step": 1694, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001976396287617146, + "lm_loss": 0.003814697265625, + "loss": 0.0089, + "step": 1695, + "total_loss": 0.003814697265625 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001976368538660885, + "lm_loss": 0.01226806640625, + "loss": 0.01, + "step": 1696, + "total_loss": 0.01226806640625 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019763407735981444, + "lm_loss": 0.005859375, + "loss": 0.0082, + "step": 1697, + "total_loss": 0.005859375 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001976312992429383, + "lm_loss": 0.00830078125, + "loss": 0.0103, + "step": 1698, + "total_loss": 0.00830078125 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019762851951550582, + "lm_loss": 0.0057373046875, + "loss": 0.0078, + "step": 1699, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019762573817756293, + "lm_loss": 0.0115966796875, + "loss": 0.0094, + "step": 1700, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.7, + "eval_lm_loss": 0.01012678537517786, + "eval_loss": 0.01052780169993639, + "eval_runtime": 44.5508, + "eval_samples_per_second": 22.446, + "eval_steps_per_second": 0.202, + "eval_total_loss": 0.01012678537517786, + "lm_loss": 0.00080108642578125, + "step": 1700, + "total_loss": 0.00080108642578125 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019762295522915549, + "lm_loss": 0.00909423828125, + "loss": 0.0085, + "step": 1701, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019762017067032943, + "lm_loss": 0.00396728515625, + "loss": 0.0063, + "step": 1702, + "total_loss": 0.00396728515625 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019761738450113062, + "lm_loss": 0.0118408203125, + "loss": 0.0082, + "step": 1703, + "total_loss": 0.0118408203125 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019761459672160508, + "lm_loss": 0.006439208984375, + "loss": 0.0086, + "step": 1704, + "total_loss": 0.006439208984375 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001976118073317988, + "lm_loss": 0.009033203125, + "loss": 0.0085, + "step": 1705, + "total_loss": 0.009033203125 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019760901633175774, + "lm_loss": 0.0115966796875, + "loss": 0.0092, + "step": 1706, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.7, + "learning_rate": 0.000197606223721528, + "lm_loss": 0.00982666015625, + "loss": 0.0074, + "step": 1707, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019760342950115561, + "lm_loss": 0.01373291015625, + "loss": 0.0092, + "step": 1708, + "total_loss": 0.01373291015625 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019760063367068667, + "lm_loss": 0.0057373046875, + "loss": 0.0078, + "step": 1709, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019759783623016732, + "lm_loss": 0.002593994140625, + "loss": 0.0061, + "step": 1710, + "total_loss": 0.002593994140625 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001975950371796437, + "lm_loss": 0.00921630859375, + "loss": 0.0084, + "step": 1711, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.7, + "learning_rate": 0.000197592236519162, + "lm_loss": 0.0120849609375, + "loss": 0.0089, + "step": 1712, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001975894342487684, + "lm_loss": 0.00518798828125, + "loss": 0.0067, + "step": 1713, + "total_loss": 0.00518798828125 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019758663036850908, + "lm_loss": 0.004974365234375, + "loss": 0.0085, + "step": 1714, + "total_loss": 0.004974365234375 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019758382487843038, + "lm_loss": 0.0096435546875, + "loss": 0.0071, + "step": 1715, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019758101777857853, + "lm_loss": 0.012939453125, + "loss": 0.009, + "step": 1716, + "total_loss": 0.012939453125 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019757820906899986, + "lm_loss": 0.01116943359375, + "loss": 0.0102, + "step": 1717, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019757539874974068, + "lm_loss": 0.00714111328125, + "loss": 0.0094, + "step": 1718, + "total_loss": 0.00714111328125 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001975725868208474, + "lm_loss": 0.00537109375, + "loss": 0.0086, + "step": 1719, + "total_loss": 0.00537109375 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001975697732823663, + "lm_loss": 0.01220703125, + "loss": 0.01, + "step": 1720, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001975669581343439, + "lm_loss": 0.00360107421875, + "loss": 0.0091, + "step": 1721, + "total_loss": 0.00360107421875 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019756414137682665, + "lm_loss": 0.010986328125, + "loss": 0.0103, + "step": 1722, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001975613230098609, + "lm_loss": 0.00732421875, + "loss": 0.01, + "step": 1723, + "total_loss": 0.00732421875 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019755850303349322, + "lm_loss": 0.021728515625, + "loss": 0.0082, + "step": 1724, + "total_loss": 0.021728515625 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019755568144777015, + "lm_loss": 0.006683349609375, + "loss": 0.0078, + "step": 1725, + "total_loss": 0.006683349609375 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019755285825273817, + "lm_loss": 0.0087890625, + "loss": 0.007, + "step": 1726, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001975500334484439, + "lm_loss": 0.01287841796875, + "loss": 0.0091, + "step": 1727, + "total_loss": 0.01287841796875 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001975472070349339, + "lm_loss": 0.004241943359375, + "loss": 0.0067, + "step": 1728, + "total_loss": 0.004241943359375 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019754437901225487, + "lm_loss": 0.0040283203125, + "loss": 0.0083, + "step": 1729, + "total_loss": 0.0040283203125 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001975415493804534, + "lm_loss": 0.0059814453125, + "loss": 0.0085, + "step": 1730, + "total_loss": 0.0059814453125 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019753871813957616, + "lm_loss": 0.007720947265625, + "loss": 0.0094, + "step": 1731, + "total_loss": 0.007720947265625 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001975358852896699, + "lm_loss": 0.007476806640625, + "loss": 0.0077, + "step": 1732, + "total_loss": 0.007476806640625 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019753305083078132, + "lm_loss": 0.01116943359375, + "loss": 0.0086, + "step": 1733, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001975302147629572, + "lm_loss": 0.0064697265625, + "loss": 0.0071, + "step": 1734, + "total_loss": 0.0064697265625 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001975273770862443, + "lm_loss": 0.01129150390625, + "loss": 0.0108, + "step": 1735, + "total_loss": 0.01129150390625 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019752453780068945, + "lm_loss": 0.0069580078125, + "loss": 0.0093, + "step": 1736, + "total_loss": 0.0069580078125 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019752169690633945, + "lm_loss": 0.005889892578125, + "loss": 0.0062, + "step": 1737, + "total_loss": 0.005889892578125 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019751885440324123, + "lm_loss": 0.0133056640625, + "loss": 0.0099, + "step": 1738, + "total_loss": 0.0133056640625 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019751601029144162, + "lm_loss": 0.004180908203125, + "loss": 0.0095, + "step": 1739, + "total_loss": 0.004180908203125 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019751316457098755, + "lm_loss": 0.0089111328125, + "loss": 0.0108, + "step": 1740, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019751031724192603, + "lm_loss": 0.005218505859375, + "loss": 0.0088, + "step": 1741, + "total_loss": 0.005218505859375 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019750746830430395, + "lm_loss": 0.006378173828125, + "loss": 0.0081, + "step": 1742, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019750461775816833, + "lm_loss": 0.01025390625, + "loss": 0.0077, + "step": 1743, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001975017656035662, + "lm_loss": 0.005401611328125, + "loss": 0.0099, + "step": 1744, + "total_loss": 0.005401611328125 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001974989118405446, + "lm_loss": 0.006195068359375, + "loss": 0.0085, + "step": 1745, + "total_loss": 0.006195068359375 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019749605646915064, + "lm_loss": 0.0034332275390625, + "loss": 0.0069, + "step": 1746, + "total_loss": 0.0034332275390625 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001974931994894314, + "lm_loss": 0.006591796875, + "loss": 0.0079, + "step": 1747, + "total_loss": 0.006591796875 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019749034090143396, + "lm_loss": 0.0050048828125, + "loss": 0.0069, + "step": 1748, + "total_loss": 0.0050048828125 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019748748070520556, + "lm_loss": 0.005523681640625, + "loss": 0.0078, + "step": 1749, + "total_loss": 0.005523681640625 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019748461890079332, + "lm_loss": 0.01513671875, + "loss": 0.01, + "step": 1750, + "total_loss": 0.01513671875 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974817554882445, + "lm_loss": 0.00982666015625, + "loss": 0.0101, + "step": 1751, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974788904676063, + "lm_loss": 0.01092529296875, + "loss": 0.0077, + "step": 1752, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019747602383892598, + "lm_loss": 0.01007080078125, + "loss": 0.0103, + "step": 1753, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019747315560225088, + "lm_loss": 0.0018768310546875, + "loss": 0.0085, + "step": 1754, + "total_loss": 0.0018768310546875 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019747028575762824, + "lm_loss": 0.0084228515625, + "loss": 0.0096, + "step": 1755, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019746741430510544, + "lm_loss": 0.0106201171875, + "loss": 0.009, + "step": 1756, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974645412447299, + "lm_loss": 0.00396728515625, + "loss": 0.0101, + "step": 1757, + "total_loss": 0.00396728515625 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974616665765489, + "lm_loss": 0.0106201171875, + "loss": 0.0072, + "step": 1758, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019745879030060994, + "lm_loss": 0.007049560546875, + "loss": 0.0086, + "step": 1759, + "total_loss": 0.007049560546875 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019745591241696045, + "lm_loss": 0.005462646484375, + "loss": 0.0079, + "step": 1760, + "total_loss": 0.005462646484375 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974530329256479, + "lm_loss": 0.0091552734375, + "loss": 0.0095, + "step": 1761, + "total_loss": 0.0091552734375 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019745015182671985, + "lm_loss": 0.0146484375, + "loss": 0.01, + "step": 1762, + "total_loss": 0.0146484375 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019744726912022373, + "lm_loss": 0.003570556640625, + "loss": 0.0084, + "step": 1763, + "total_loss": 0.003570556640625 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019744438480620712, + "lm_loss": 0.0098876953125, + "loss": 0.0099, + "step": 1764, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019744149888471766, + "lm_loss": 0.01031494140625, + "loss": 0.0096, + "step": 1765, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974386113558029, + "lm_loss": 0.010498046875, + "loss": 0.0086, + "step": 1766, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974357222195105, + "lm_loss": 0.01385498046875, + "loss": 0.0087, + "step": 1767, + "total_loss": 0.01385498046875 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974328314758881, + "lm_loss": 0.00616455078125, + "loss": 0.0068, + "step": 1768, + "total_loss": 0.00616455078125 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019742993912498337, + "lm_loss": 0.00439453125, + "loss": 0.0076, + "step": 1769, + "total_loss": 0.00439453125 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974270451668441, + "lm_loss": 0.006378173828125, + "loss": 0.0076, + "step": 1770, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019742414960151794, + "lm_loss": 0.00897216796875, + "loss": 0.0087, + "step": 1771, + "total_loss": 0.00897216796875 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974212524290527, + "lm_loss": 0.00677490234375, + "loss": 0.0073, + "step": 1772, + "total_loss": 0.00677490234375 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001974183536494962, + "lm_loss": 0.00634765625, + "loss": 0.0082, + "step": 1773, + "total_loss": 0.00634765625 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019741545326289618, + "lm_loss": 0.010498046875, + "loss": 0.0078, + "step": 1774, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019741255126930058, + "lm_loss": 0.00750732421875, + "loss": 0.0093, + "step": 1775, + "total_loss": 0.00750732421875 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001974096476687572, + "lm_loss": 0.0115966796875, + "loss": 0.0101, + "step": 1776, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019740674246131396, + "lm_loss": 0.0078125, + "loss": 0.0087, + "step": 1777, + "total_loss": 0.0078125 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001974038356470188, + "lm_loss": 0.004638671875, + "loss": 0.0068, + "step": 1778, + "total_loss": 0.004638671875 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019740092722591966, + "lm_loss": 0.0120849609375, + "loss": 0.0088, + "step": 1779, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001973980171980645, + "lm_loss": 0.0137939453125, + "loss": 0.0096, + "step": 1780, + "total_loss": 0.0137939453125 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019739510556350136, + "lm_loss": 0.007415771484375, + "loss": 0.0069, + "step": 1781, + "total_loss": 0.007415771484375 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019739219232227826, + "lm_loss": 0.01080322265625, + "loss": 0.008, + "step": 1782, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019738927747444323, + "lm_loss": 0.01226806640625, + "loss": 0.0082, + "step": 1783, + "total_loss": 0.01226806640625 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019738636102004442, + "lm_loss": 0.00506591796875, + "loss": 0.0097, + "step": 1784, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019738344295912988, + "lm_loss": 0.00653076171875, + "loss": 0.0089, + "step": 1785, + "total_loss": 0.00653076171875 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019738052329174775, + "lm_loss": 0.00732421875, + "loss": 0.01, + "step": 1786, + "total_loss": 0.00732421875 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019737760201794623, + "lm_loss": 0.004119873046875, + "loss": 0.0065, + "step": 1787, + "total_loss": 0.004119873046875 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019737467913777347, + "lm_loss": 0.007293701171875, + "loss": 0.007, + "step": 1788, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001973717546512777, + "lm_loss": 0.007049560546875, + "loss": 0.0077, + "step": 1789, + "total_loss": 0.007049560546875 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001973688285585072, + "lm_loss": 0.006927490234375, + "loss": 0.0074, + "step": 1790, + "total_loss": 0.006927490234375 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019736590085951016, + "lm_loss": 0.01263427734375, + "loss": 0.0088, + "step": 1791, + "total_loss": 0.01263427734375 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019736297155433495, + "lm_loss": 0.0078125, + "loss": 0.0086, + "step": 1792, + "total_loss": 0.0078125 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019736004064302987, + "lm_loss": 0.004241943359375, + "loss": 0.0075, + "step": 1793, + "total_loss": 0.004241943359375 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019735710812564326, + "lm_loss": 0.01397705078125, + "loss": 0.0086, + "step": 1794, + "total_loss": 0.01397705078125 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019735417400222353, + "lm_loss": 0.0035858154296875, + "loss": 0.0096, + "step": 1795, + "total_loss": 0.0035858154296875 + }, + { + "epoch": 0.73, + "learning_rate": 0.000197351238272819, + "lm_loss": 0.006988525390625, + "loss": 0.0074, + "step": 1796, + "total_loss": 0.006988525390625 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001973483009374782, + "lm_loss": 0.011962890625, + "loss": 0.0093, + "step": 1797, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019734536199624955, + "lm_loss": 0.01190185546875, + "loss": 0.0091, + "step": 1798, + "total_loss": 0.01190185546875 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019734242144918147, + "lm_loss": 0.0025634765625, + "loss": 0.0086, + "step": 1799, + "total_loss": 0.0025634765625 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019733947929632254, + "lm_loss": 0.0185546875, + "loss": 0.0096, + "step": 1800, + "total_loss": 0.0185546875 + }, + { + "epoch": 0.74, + "eval_lm_loss": 0.010203767567873001, + "eval_loss": 0.010550899431109428, + "eval_runtime": 43.9434, + "eval_samples_per_second": 22.757, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.010203767567873001, + "lm_loss": 0.00128936767578125, + "step": 1800, + "total_loss": 0.00128936767578125 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019733653553772126, + "lm_loss": 0.010009765625, + "loss": 0.0092, + "step": 1801, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019733359017342622, + "lm_loss": 0.01483154296875, + "loss": 0.0092, + "step": 1802, + "total_loss": 0.01483154296875 + }, + { + "epoch": 0.74, + "learning_rate": 0.000197330643203486, + "lm_loss": 0.00946044921875, + "loss": 0.0069, + "step": 1803, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001973276946279492, + "lm_loss": 0.01007080078125, + "loss": 0.0091, + "step": 1804, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019732474444686447, + "lm_loss": 0.008056640625, + "loss": 0.0083, + "step": 1805, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019732179266028046, + "lm_loss": 0.006744384765625, + "loss": 0.0083, + "step": 1806, + "total_loss": 0.006744384765625 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001973188392682459, + "lm_loss": 0.00787353515625, + "loss": 0.0091, + "step": 1807, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019731588427080948, + "lm_loss": 0.00634765625, + "loss": 0.0059, + "step": 1808, + "total_loss": 0.00634765625 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019731292766801995, + "lm_loss": 0.005889892578125, + "loss": 0.006, + "step": 1809, + "total_loss": 0.005889892578125 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019730996945992605, + "lm_loss": 0.00677490234375, + "loss": 0.0084, + "step": 1810, + "total_loss": 0.00677490234375 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019730700964657664, + "lm_loss": 0.00823974609375, + "loss": 0.009, + "step": 1811, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019730404822802054, + "lm_loss": 0.0086669921875, + "loss": 0.0088, + "step": 1812, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001973010852043066, + "lm_loss": 0.003082275390625, + "loss": 0.0088, + "step": 1813, + "total_loss": 0.003082275390625 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019729812057548367, + "lm_loss": 0.006103515625, + "loss": 0.008, + "step": 1814, + "total_loss": 0.006103515625 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019729515434160064, + "lm_loss": 0.004791259765625, + "loss": 0.0088, + "step": 1815, + "total_loss": 0.004791259765625 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019729218650270652, + "lm_loss": 0.006378173828125, + "loss": 0.0075, + "step": 1816, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019728921705885018, + "lm_loss": 0.007537841796875, + "loss": 0.0079, + "step": 1817, + "total_loss": 0.007537841796875 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019728624601008066, + "lm_loss": 0.005340576171875, + "loss": 0.0082, + "step": 1818, + "total_loss": 0.005340576171875 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019728327335644694, + "lm_loss": 0.0135498046875, + "loss": 0.0086, + "step": 1819, + "total_loss": 0.0135498046875 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001972802990979981, + "lm_loss": 0.006134033203125, + "loss": 0.0094, + "step": 1820, + "total_loss": 0.006134033203125 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019727732323478314, + "lm_loss": 0.01025390625, + "loss": 0.0093, + "step": 1821, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001972743457668512, + "lm_loss": 0.00787353515625, + "loss": 0.0088, + "step": 1822, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019727136669425144, + "lm_loss": 0.007171630859375, + "loss": 0.009, + "step": 1823, + "total_loss": 0.007171630859375 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019726838601703288, + "lm_loss": 0.01239013671875, + "loss": 0.0079, + "step": 1824, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001972654037352448, + "lm_loss": 0.005523681640625, + "loss": 0.0083, + "step": 1825, + "total_loss": 0.005523681640625 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019726241984893635, + "lm_loss": 0.01287841796875, + "loss": 0.0104, + "step": 1826, + "total_loss": 0.01287841796875 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019725943435815676, + "lm_loss": 0.00970458984375, + "loss": 0.0076, + "step": 1827, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019725644726295528, + "lm_loss": 0.01220703125, + "loss": 0.0084, + "step": 1828, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019725345856338119, + "lm_loss": 0.0098876953125, + "loss": 0.0091, + "step": 1829, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019725046825948376, + "lm_loss": 0.0064697265625, + "loss": 0.0078, + "step": 1830, + "total_loss": 0.0064697265625 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019724747635131235, + "lm_loss": 0.003875732421875, + "loss": 0.0087, + "step": 1831, + "total_loss": 0.003875732421875 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019724448283891635, + "lm_loss": 0.01031494140625, + "loss": 0.0078, + "step": 1832, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019724148772234509, + "lm_loss": 0.006256103515625, + "loss": 0.0084, + "step": 1833, + "total_loss": 0.006256103515625 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019723849100164795, + "lm_loss": 0.0050048828125, + "loss": 0.0077, + "step": 1834, + "total_loss": 0.0050048828125 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019723549267687445, + "lm_loss": 0.006591796875, + "loss": 0.0088, + "step": 1835, + "total_loss": 0.006591796875 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019723249274807402, + "lm_loss": 0.00421142578125, + "loss": 0.0091, + "step": 1836, + "total_loss": 0.00421142578125 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001972294912152961, + "lm_loss": 0.00946044921875, + "loss": 0.0086, + "step": 1837, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019722648807859026, + "lm_loss": 0.006500244140625, + "loss": 0.0072, + "step": 1838, + "total_loss": 0.006500244140625 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019722348333800602, + "lm_loss": 0.006439208984375, + "loss": 0.0079, + "step": 1839, + "total_loss": 0.006439208984375 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019722047699359294, + "lm_loss": 0.004730224609375, + "loss": 0.0082, + "step": 1840, + "total_loss": 0.004730224609375 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019721746904540063, + "lm_loss": 0.005859375, + "loss": 0.0095, + "step": 1841, + "total_loss": 0.005859375 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001972144594934787, + "lm_loss": 0.0020294189453125, + "loss": 0.0072, + "step": 1842, + "total_loss": 0.0020294189453125 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001972114483378768, + "lm_loss": 0.0107421875, + "loss": 0.0087, + "step": 1843, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019720843557864462, + "lm_loss": 0.0169677734375, + "loss": 0.0087, + "step": 1844, + "total_loss": 0.0169677734375 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019720542121583182, + "lm_loss": 0.006103515625, + "loss": 0.0079, + "step": 1845, + "total_loss": 0.006103515625 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019720240524948816, + "lm_loss": 0.01318359375, + "loss": 0.0087, + "step": 1846, + "total_loss": 0.01318359375 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019719938767966336, + "lm_loss": 0.006683349609375, + "loss": 0.0068, + "step": 1847, + "total_loss": 0.006683349609375 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001971963685064072, + "lm_loss": 0.01239013671875, + "loss": 0.0094, + "step": 1848, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019719334772976956, + "lm_loss": 0.01190185546875, + "loss": 0.0076, + "step": 1849, + "total_loss": 0.01190185546875 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019719032534980016, + "lm_loss": 0.00616455078125, + "loss": 0.0093, + "step": 1850, + "total_loss": 0.00616455078125 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019718730136654894, + "lm_loss": 0.0128173828125, + "loss": 0.0097, + "step": 1851, + "total_loss": 0.0128173828125 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019718427578006576, + "lm_loss": 0.00347900390625, + "loss": 0.007, + "step": 1852, + "total_loss": 0.00347900390625 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019718124859040054, + "lm_loss": 0.0159912109375, + "loss": 0.0091, + "step": 1853, + "total_loss": 0.0159912109375 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001971782197976032, + "lm_loss": 0.0032196044921875, + "loss": 0.0077, + "step": 1854, + "total_loss": 0.0032196044921875 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019717518940172367, + "lm_loss": 0.00994873046875, + "loss": 0.0077, + "step": 1855, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019717215740281202, + "lm_loss": 0.01434326171875, + "loss": 0.0085, + "step": 1856, + "total_loss": 0.01434326171875 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019716912380091822, + "lm_loss": 0.0034637451171875, + "loss": 0.0072, + "step": 1857, + "total_loss": 0.0034637451171875 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019716608859609232, + "lm_loss": 0.01129150390625, + "loss": 0.007, + "step": 1858, + "total_loss": 0.01129150390625 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019716305178838435, + "lm_loss": 0.004058837890625, + "loss": 0.0096, + "step": 1859, + "total_loss": 0.004058837890625 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001971600133778445, + "lm_loss": 0.01361083984375, + "loss": 0.0082, + "step": 1860, + "total_loss": 0.01361083984375 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019715697336452283, + "lm_loss": 0.00836181640625, + "loss": 0.0065, + "step": 1861, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001971539317484695, + "lm_loss": 0.01068115234375, + "loss": 0.0077, + "step": 1862, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019715088852973467, + "lm_loss": 0.01116943359375, + "loss": 0.0073, + "step": 1863, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019714784370836855, + "lm_loss": 0.0033721923828125, + "loss": 0.0076, + "step": 1864, + "total_loss": 0.0033721923828125 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019714479728442138, + "lm_loss": 0.00714111328125, + "loss": 0.0081, + "step": 1865, + "total_loss": 0.00714111328125 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001971417492579434, + "lm_loss": 0.007232666015625, + "loss": 0.0057, + "step": 1866, + "total_loss": 0.007232666015625 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019713869962898492, + "lm_loss": 0.00469970703125, + "loss": 0.0094, + "step": 1867, + "total_loss": 0.00469970703125 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019713564839759622, + "lm_loss": 0.005645751953125, + "loss": 0.0115, + "step": 1868, + "total_loss": 0.005645751953125 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019713259556382765, + "lm_loss": 0.005126953125, + "loss": 0.0079, + "step": 1869, + "total_loss": 0.005126953125 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019712954112772956, + "lm_loss": 0.00482177734375, + "loss": 0.0082, + "step": 1870, + "total_loss": 0.00482177734375 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001971264850893523, + "lm_loss": 0.007232666015625, + "loss": 0.0084, + "step": 1871, + "total_loss": 0.007232666015625 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019712342744874635, + "lm_loss": 0.0076904296875, + "loss": 0.0078, + "step": 1872, + "total_loss": 0.0076904296875 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019712036820596215, + "lm_loss": 0.00579833984375, + "loss": 0.0074, + "step": 1873, + "total_loss": 0.00579833984375 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019711730736105007, + "lm_loss": 0.0030670166015625, + "loss": 0.0101, + "step": 1874, + "total_loss": 0.0030670166015625 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001971142449140607, + "lm_loss": 0.0115966796875, + "loss": 0.0086, + "step": 1875, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019711118086504457, + "lm_loss": 0.0155029296875, + "loss": 0.0077, + "step": 1876, + "total_loss": 0.0155029296875 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019710811521405215, + "lm_loss": 0.00628662109375, + "loss": 0.0081, + "step": 1877, + "total_loss": 0.00628662109375 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019710504796113405, + "lm_loss": 0.008056640625, + "loss": 0.0107, + "step": 1878, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019710197910634088, + "lm_loss": 0.006683349609375, + "loss": 0.0104, + "step": 1879, + "total_loss": 0.006683349609375 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019709890864972322, + "lm_loss": 0.0096435546875, + "loss": 0.0091, + "step": 1880, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019709583659133174, + "lm_loss": 0.0184326171875, + "loss": 0.0116, + "step": 1881, + "total_loss": 0.0184326171875 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001970927629312172, + "lm_loss": 0.005615234375, + "loss": 0.0087, + "step": 1882, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019708968766943017, + "lm_loss": 0.00640869140625, + "loss": 0.0086, + "step": 1883, + "total_loss": 0.00640869140625 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019708661080602147, + "lm_loss": 0.00738525390625, + "loss": 0.0083, + "step": 1884, + "total_loss": 0.00738525390625 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019708353234104182, + "lm_loss": 0.01055908203125, + "loss": 0.0078, + "step": 1885, + "total_loss": 0.01055908203125 + }, + { + "epoch": 0.77, + "learning_rate": 0.000197080452274542, + "lm_loss": 0.00860595703125, + "loss": 0.0089, + "step": 1886, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019707737060657287, + "lm_loss": 0.01116943359375, + "loss": 0.0086, + "step": 1887, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001970742873371852, + "lm_loss": 0.0081787109375, + "loss": 0.0099, + "step": 1888, + "total_loss": 0.0081787109375 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001970712024664299, + "lm_loss": 0.003143310546875, + "loss": 0.0082, + "step": 1889, + "total_loss": 0.003143310546875 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019706811599435784, + "lm_loss": 0.0057373046875, + "loss": 0.0093, + "step": 1890, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019706502792101992, + "lm_loss": 0.004913330078125, + "loss": 0.0072, + "step": 1891, + "total_loss": 0.004913330078125 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001970619382464671, + "lm_loss": 0.007659912109375, + "loss": 0.0072, + "step": 1892, + "total_loss": 0.007659912109375 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019705884697075035, + "lm_loss": 0.00469970703125, + "loss": 0.0065, + "step": 1893, + "total_loss": 0.00469970703125 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019705575409392068, + "lm_loss": 0.00616455078125, + "loss": 0.0075, + "step": 1894, + "total_loss": 0.00616455078125 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019705265961602906, + "lm_loss": 0.004638671875, + "loss": 0.0079, + "step": 1895, + "total_loss": 0.004638671875 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001970495635371266, + "lm_loss": 0.00274658203125, + "loss": 0.0074, + "step": 1896, + "total_loss": 0.00274658203125 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019704646585726434, + "lm_loss": 0.00909423828125, + "loss": 0.0086, + "step": 1897, + "total_loss": 0.00909423828125 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019704336657649335, + "lm_loss": 0.005279541015625, + "loss": 0.0093, + "step": 1898, + "total_loss": 0.005279541015625 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019704026569486484, + "lm_loss": 0.01031494140625, + "loss": 0.0071, + "step": 1899, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019703716321242988, + "lm_loss": 0.0022735595703125, + "loss": 0.0075, + "step": 1900, + "total_loss": 0.0022735595703125 + }, + { + "epoch": 0.78, + "eval_lm_loss": 0.010396833531558514, + "eval_loss": 0.010781650431454182, + "eval_runtime": 43.9963, + "eval_samples_per_second": 22.729, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.010396833531558514, + "lm_loss": 0.0005950927734375, + "step": 1900, + "total_loss": 0.0005950927734375 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019703405912923968, + "lm_loss": 0.00750732421875, + "loss": 0.0083, + "step": 1901, + "total_loss": 0.00750732421875 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001970309534453455, + "lm_loss": 0.01007080078125, + "loss": 0.0086, + "step": 1902, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019702784616079845, + "lm_loss": 0.005828857421875, + "loss": 0.0076, + "step": 1903, + "total_loss": 0.005828857421875 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019702473727564992, + "lm_loss": 0.0035552978515625, + "loss": 0.0097, + "step": 1904, + "total_loss": 0.0035552978515625 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019702162678995107, + "lm_loss": 0.004669189453125, + "loss": 0.0073, + "step": 1905, + "total_loss": 0.004669189453125 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019701851470375334, + "lm_loss": 0.01446533203125, + "loss": 0.0099, + "step": 1906, + "total_loss": 0.01446533203125 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019701540101710799, + "lm_loss": 0.013427734375, + "loss": 0.0089, + "step": 1907, + "total_loss": 0.013427734375 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001970122857300664, + "lm_loss": 0.007293701171875, + "loss": 0.0063, + "step": 1908, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019700916884267993, + "lm_loss": 0.00616455078125, + "loss": 0.0084, + "step": 1909, + "total_loss": 0.00616455078125 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019700605035500007, + "lm_loss": 0.005462646484375, + "loss": 0.0065, + "step": 1910, + "total_loss": 0.005462646484375 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019700293026707815, + "lm_loss": 0.010498046875, + "loss": 0.0088, + "step": 1911, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019699980857896578, + "lm_loss": 0.0106201171875, + "loss": 0.0093, + "step": 1912, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019699668529071435, + "lm_loss": 0.00567626953125, + "loss": 0.0068, + "step": 1913, + "total_loss": 0.00567626953125 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019699356040237545, + "lm_loss": 0.00640869140625, + "loss": 0.0072, + "step": 1914, + "total_loss": 0.00640869140625 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001969904339140006, + "lm_loss": 0.004486083984375, + "loss": 0.0095, + "step": 1915, + "total_loss": 0.004486083984375 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019698730582564134, + "lm_loss": 0.006011962890625, + "loss": 0.0076, + "step": 1916, + "total_loss": 0.006011962890625 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019698417613734934, + "lm_loss": 0.0025177001953125, + "loss": 0.0087, + "step": 1917, + "total_loss": 0.0025177001953125 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019698104484917617, + "lm_loss": 0.00860595703125, + "loss": 0.0079, + "step": 1918, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019697791196117352, + "lm_loss": 0.0123291015625, + "loss": 0.0106, + "step": 1919, + "total_loss": 0.0123291015625 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019697477747339305, + "lm_loss": 0.018310546875, + "loss": 0.0088, + "step": 1920, + "total_loss": 0.018310546875 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001969716413858865, + "lm_loss": 0.01025390625, + "loss": 0.01, + "step": 1921, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019696850369870558, + "lm_loss": 0.01043701171875, + "loss": 0.0089, + "step": 1922, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.79, + "learning_rate": 0.000196965364411902, + "lm_loss": 0.01177978515625, + "loss": 0.0105, + "step": 1923, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019696222352552764, + "lm_loss": 0.0093994140625, + "loss": 0.0073, + "step": 1924, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001969590810396343, + "lm_loss": 0.01190185546875, + "loss": 0.0096, + "step": 1925, + "total_loss": 0.01190185546875 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019695593695427376, + "lm_loss": 0.0050048828125, + "loss": 0.0082, + "step": 1926, + "total_loss": 0.0050048828125 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019695279126949792, + "lm_loss": 0.0115966796875, + "loss": 0.01, + "step": 1927, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019694964398535864, + "lm_loss": 0.011474609375, + "loss": 0.0071, + "step": 1928, + "total_loss": 0.011474609375 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001969464951019079, + "lm_loss": 0.00946044921875, + "loss": 0.0086, + "step": 1929, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019694334461919764, + "lm_loss": 0.00860595703125, + "loss": 0.0078, + "step": 1930, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019694019253727976, + "lm_loss": 0.0084228515625, + "loss": 0.0073, + "step": 1931, + "total_loss": 0.0084228515625 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001969370388562063, + "lm_loss": 0.00787353515625, + "loss": 0.0089, + "step": 1932, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001969338835760293, + "lm_loss": 0.0034637451171875, + "loss": 0.0082, + "step": 1933, + "total_loss": 0.0034637451171875 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001969307266968008, + "lm_loss": 0.006622314453125, + "loss": 0.0086, + "step": 1934, + "total_loss": 0.006622314453125 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019692756821857287, + "lm_loss": 0.0036163330078125, + "loss": 0.0072, + "step": 1935, + "total_loss": 0.0036163330078125 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019692440814139762, + "lm_loss": 0.01165771484375, + "loss": 0.0099, + "step": 1936, + "total_loss": 0.01165771484375 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019692124646532717, + "lm_loss": 0.007293701171875, + "loss": 0.007, + "step": 1937, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001969180831904137, + "lm_loss": 0.00579833984375, + "loss": 0.0077, + "step": 1938, + "total_loss": 0.00579833984375 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019691491831670934, + "lm_loss": 0.0185546875, + "loss": 0.0086, + "step": 1939, + "total_loss": 0.0185546875 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019691175184426634, + "lm_loss": 0.0101318359375, + "loss": 0.0087, + "step": 1940, + "total_loss": 0.0101318359375 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019690858377313693, + "lm_loss": 0.00860595703125, + "loss": 0.0089, + "step": 1941, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001969054141033734, + "lm_loss": 0.0140380859375, + "loss": 0.0087, + "step": 1942, + "total_loss": 0.0140380859375 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019690224283502798, + "lm_loss": 0.00701904296875, + "loss": 0.0082, + "step": 1943, + "total_loss": 0.00701904296875 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019689906996815302, + "lm_loss": 0.006072998046875, + "loss": 0.0065, + "step": 1944, + "total_loss": 0.006072998046875 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019689589550280086, + "lm_loss": 0.0034332275390625, + "loss": 0.0073, + "step": 1945, + "total_loss": 0.0034332275390625 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019689271943902384, + "lm_loss": 0.013916015625, + "loss": 0.0093, + "step": 1946, + "total_loss": 0.013916015625 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019688954177687437, + "lm_loss": 0.011474609375, + "loss": 0.0088, + "step": 1947, + "total_loss": 0.011474609375 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001968863625164049, + "lm_loss": 0.0118408203125, + "loss": 0.0097, + "step": 1948, + "total_loss": 0.0118408203125 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019688318165766782, + "lm_loss": 0.0030517578125, + "loss": 0.0091, + "step": 1949, + "total_loss": 0.0030517578125 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019687999920071564, + "lm_loss": 0.007568359375, + "loss": 0.01, + "step": 1950, + "total_loss": 0.007568359375 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019687681514560086, + "lm_loss": 0.012451171875, + "loss": 0.0107, + "step": 1951, + "total_loss": 0.012451171875 + }, + { + "epoch": 0.8, + "learning_rate": 0.000196873629492376, + "lm_loss": 0.007568359375, + "loss": 0.0108, + "step": 1952, + "total_loss": 0.007568359375 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019687044224109358, + "lm_loss": 0.004364013671875, + "loss": 0.0087, + "step": 1953, + "total_loss": 0.004364013671875 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019686725339180622, + "lm_loss": 0.004302978515625, + "loss": 0.007, + "step": 1954, + "total_loss": 0.004302978515625 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019686406294456651, + "lm_loss": 0.00958251953125, + "loss": 0.0107, + "step": 1955, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019686087089942707, + "lm_loss": 0.008544921875, + "loss": 0.0074, + "step": 1956, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001968576772564406, + "lm_loss": 0.005584716796875, + "loss": 0.0089, + "step": 1957, + "total_loss": 0.005584716796875 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001968544820156597, + "lm_loss": 0.01007080078125, + "loss": 0.0091, + "step": 1958, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019685128517713717, + "lm_loss": 0.01171875, + "loss": 0.0085, + "step": 1959, + "total_loss": 0.01171875 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019684808674092568, + "lm_loss": 0.01214599609375, + "loss": 0.0093, + "step": 1960, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019684488670707802, + "lm_loss": 0.00933837890625, + "loss": 0.0081, + "step": 1961, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019684168507564696, + "lm_loss": 0.011962890625, + "loss": 0.011, + "step": 1962, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001968384818466854, + "lm_loss": 0.0079345703125, + "loss": 0.0083, + "step": 1963, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019683527702024604, + "lm_loss": 0.01348876953125, + "loss": 0.01, + "step": 1964, + "total_loss": 0.01348876953125 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019683207059638185, + "lm_loss": 0.013916015625, + "loss": 0.0089, + "step": 1965, + "total_loss": 0.013916015625 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001968288625751457, + "lm_loss": 0.00537109375, + "loss": 0.0077, + "step": 1966, + "total_loss": 0.00537109375 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001968256529565905, + "lm_loss": 0.0067138671875, + "loss": 0.0092, + "step": 1967, + "total_loss": 0.0067138671875 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001968224417407692, + "lm_loss": 0.01068115234375, + "loss": 0.0076, + "step": 1968, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019681922892773477, + "lm_loss": 0.0059814453125, + "loss": 0.0068, + "step": 1969, + "total_loss": 0.0059814453125 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001968160145175402, + "lm_loss": 0.00592041015625, + "loss": 0.0086, + "step": 1970, + "total_loss": 0.00592041015625 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019681279851023855, + "lm_loss": 0.005218505859375, + "loss": 0.0075, + "step": 1971, + "total_loss": 0.005218505859375 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019680958090588285, + "lm_loss": 0.0086669921875, + "loss": 0.0086, + "step": 1972, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001968063617045262, + "lm_loss": 0.00958251953125, + "loss": 0.0099, + "step": 1973, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019680314090622165, + "lm_loss": 0.01483154296875, + "loss": 0.008, + "step": 1974, + "total_loss": 0.01483154296875 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001967999185110224, + "lm_loss": 0.0164794921875, + "loss": 0.0103, + "step": 1975, + "total_loss": 0.0164794921875 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019679669451898156, + "lm_loss": 0.0059814453125, + "loss": 0.0082, + "step": 1976, + "total_loss": 0.0059814453125 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019679346893015234, + "lm_loss": 0.00811767578125, + "loss": 0.0098, + "step": 1977, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019679024174458793, + "lm_loss": 0.003570556640625, + "loss": 0.0077, + "step": 1978, + "total_loss": 0.003570556640625 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001967870129623416, + "lm_loss": 0.007659912109375, + "loss": 0.0065, + "step": 1979, + "total_loss": 0.007659912109375 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019678378258346655, + "lm_loss": 0.004119873046875, + "loss": 0.0075, + "step": 1980, + "total_loss": 0.004119873046875 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019678055060801613, + "lm_loss": 0.011962890625, + "loss": 0.0082, + "step": 1981, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019677731703604363, + "lm_loss": 0.0057373046875, + "loss": 0.0066, + "step": 1982, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001967740818676024, + "lm_loss": 0.00433349609375, + "loss": 0.0082, + "step": 1983, + "total_loss": 0.00433349609375 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019677084510274581, + "lm_loss": 0.01031494140625, + "loss": 0.0078, + "step": 1984, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019676760674152725, + "lm_loss": 0.0140380859375, + "loss": 0.0101, + "step": 1985, + "total_loss": 0.0140380859375 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019676436678400013, + "lm_loss": 0.0074462890625, + "loss": 0.0074, + "step": 1986, + "total_loss": 0.0074462890625 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019676112523021793, + "lm_loss": 0.00665283203125, + "loss": 0.0086, + "step": 1987, + "total_loss": 0.00665283203125 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001967578820802341, + "lm_loss": 0.008056640625, + "loss": 0.0081, + "step": 1988, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001967546373341021, + "lm_loss": 0.01708984375, + "loss": 0.0086, + "step": 1989, + "total_loss": 0.01708984375 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019675139099187554, + "lm_loss": 0.0107421875, + "loss": 0.0082, + "step": 1990, + "total_loss": 0.0107421875 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019674814305360792, + "lm_loss": 0.006011962890625, + "loss": 0.0062, + "step": 1991, + "total_loss": 0.006011962890625 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019674489351935283, + "lm_loss": 0.0062255859375, + "loss": 0.0073, + "step": 1992, + "total_loss": 0.0062255859375 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019674164238916387, + "lm_loss": 0.006134033203125, + "loss": 0.0076, + "step": 1993, + "total_loss": 0.006134033203125 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001967383896630947, + "lm_loss": 0.01019287109375, + "loss": 0.0086, + "step": 1994, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001967351353411989, + "lm_loss": 0.01263427734375, + "loss": 0.0094, + "step": 1995, + "total_loss": 0.01263427734375 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019673187942353024, + "lm_loss": 0.007659912109375, + "loss": 0.0084, + "step": 1996, + "total_loss": 0.007659912109375 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001967286219101424, + "lm_loss": 0.0106201171875, + "loss": 0.0084, + "step": 1997, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019672536280108914, + "lm_loss": 0.00823974609375, + "loss": 0.0092, + "step": 1998, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019672210209642417, + "lm_loss": 0.0067138671875, + "loss": 0.0085, + "step": 1999, + "total_loss": 0.0067138671875 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001967188397962013, + "lm_loss": 0.00567626953125, + "loss": 0.0073, + "step": 2000, + "total_loss": 0.00567626953125 + }, + { + "epoch": 0.82, + "eval_lm_loss": 0.009454338811337948, + "eval_loss": 0.009905356913805008, + "eval_runtime": 43.8772, + "eval_samples_per_second": 22.791, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009454338811337948, + "lm_loss": 0.00183868408203125, + "step": 2000, + "total_loss": 0.00183868408203125 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001967155759004744, + "lm_loss": 0.0111083984375, + "loss": 0.0098, + "step": 2001, + "total_loss": 0.0111083984375 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001967123104092972, + "lm_loss": 0.01251220703125, + "loss": 0.01, + "step": 2002, + "total_loss": 0.01251220703125 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001967090433227237, + "lm_loss": 0.004119873046875, + "loss": 0.0094, + "step": 2003, + "total_loss": 0.004119873046875 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001967057746408077, + "lm_loss": 0.00439453125, + "loss": 0.009, + "step": 2004, + "total_loss": 0.00439453125 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019670250436360317, + "lm_loss": 0.005096435546875, + "loss": 0.0079, + "step": 2005, + "total_loss": 0.005096435546875 + }, + { + "epoch": 0.82, + "learning_rate": 0.000196699232491164, + "lm_loss": 0.003692626953125, + "loss": 0.0084, + "step": 2006, + "total_loss": 0.003692626953125 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019669595902354424, + "lm_loss": 0.00836181640625, + "loss": 0.0078, + "step": 2007, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019669268396079788, + "lm_loss": 0.01025390625, + "loss": 0.0094, + "step": 2008, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001966894073029789, + "lm_loss": 0.01708984375, + "loss": 0.0114, + "step": 2009, + "total_loss": 0.01708984375 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019668612905014135, + "lm_loss": 0.0118408203125, + "loss": 0.0086, + "step": 2010, + "total_loss": 0.0118408203125 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019668284920233933, + "lm_loss": 0.004638671875, + "loss": 0.0075, + "step": 2011, + "total_loss": 0.004638671875 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019667956775962697, + "lm_loss": 0.01171875, + "loss": 0.0073, + "step": 2012, + "total_loss": 0.01171875 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019667628472205836, + "lm_loss": 0.00506591796875, + "loss": 0.0086, + "step": 2013, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001966730000896877, + "lm_loss": 0.00970458984375, + "loss": 0.0077, + "step": 2014, + "total_loss": 0.00970458984375 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001966697138625691, + "lm_loss": 0.00506591796875, + "loss": 0.007, + "step": 2015, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019666642604075686, + "lm_loss": 0.0042724609375, + "loss": 0.0091, + "step": 2016, + "total_loss": 0.0042724609375 + }, + { + "epoch": 0.82, + "learning_rate": 0.0001966631366243052, + "lm_loss": 0.006805419921875, + "loss": 0.0072, + "step": 2017, + "total_loss": 0.006805419921875 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019665984561326833, + "lm_loss": 0.01031494140625, + "loss": 0.0076, + "step": 2018, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001966565530077006, + "lm_loss": 0.00787353515625, + "loss": 0.0084, + "step": 2019, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019665325880765627, + "lm_loss": 0.01953125, + "loss": 0.0084, + "step": 2020, + "total_loss": 0.01953125 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019664996301318972, + "lm_loss": 0.004547119140625, + "loss": 0.0091, + "step": 2021, + "total_loss": 0.004547119140625 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001966466656243553, + "lm_loss": 0.002777099609375, + "loss": 0.0086, + "step": 2022, + "total_loss": 0.002777099609375 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019664336664120745, + "lm_loss": 0.009033203125, + "loss": 0.0104, + "step": 2023, + "total_loss": 0.009033203125 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001966400660638005, + "lm_loss": 0.010498046875, + "loss": 0.0091, + "step": 2024, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019663676389218896, + "lm_loss": 0.009521484375, + "loss": 0.008, + "step": 2025, + "total_loss": 0.009521484375 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019663346012642734, + "lm_loss": 0.00482177734375, + "loss": 0.0073, + "step": 2026, + "total_loss": 0.00482177734375 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019663015476657005, + "lm_loss": 0.006072998046875, + "loss": 0.0086, + "step": 2027, + "total_loss": 0.006072998046875 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001966268478126717, + "lm_loss": 0.0125732421875, + "loss": 0.0076, + "step": 2028, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001966235392647868, + "lm_loss": 0.00848388671875, + "loss": 0.0098, + "step": 2029, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019662022912296986, + "lm_loss": 0.00921630859375, + "loss": 0.0093, + "step": 2030, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019661691738727564, + "lm_loss": 0.00506591796875, + "loss": 0.0079, + "step": 2031, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019661360405775867, + "lm_loss": 0.01080322265625, + "loss": 0.0092, + "step": 2032, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001966102891344736, + "lm_loss": 0.003326416015625, + "loss": 0.0101, + "step": 2033, + "total_loss": 0.003326416015625 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019660697261747516, + "lm_loss": 0.0057373046875, + "loss": 0.0093, + "step": 2034, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019660365450681805, + "lm_loss": 0.008056640625, + "loss": 0.0092, + "step": 2035, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.83, + "learning_rate": 0.000196600334802557, + "lm_loss": 0.0079345703125, + "loss": 0.0087, + "step": 2036, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019659701350474676, + "lm_loss": 0.007049560546875, + "loss": 0.0078, + "step": 2037, + "total_loss": 0.007049560546875 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019659369061344212, + "lm_loss": 0.0093994140625, + "loss": 0.0074, + "step": 2038, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001965903661286979, + "lm_loss": 0.00787353515625, + "loss": 0.0075, + "step": 2039, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019658704005056895, + "lm_loss": 0.007293701171875, + "loss": 0.0092, + "step": 2040, + "total_loss": 0.007293701171875 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019658371237911014, + "lm_loss": 0.00933837890625, + "loss": 0.0078, + "step": 2041, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001965803831143764, + "lm_loss": 0.0125732421875, + "loss": 0.0082, + "step": 2042, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019657705225642255, + "lm_loss": 0.01312255859375, + "loss": 0.0079, + "step": 2043, + "total_loss": 0.01312255859375 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019657371980530362, + "lm_loss": 0.01458740234375, + "loss": 0.0094, + "step": 2044, + "total_loss": 0.01458740234375 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019657038576107456, + "lm_loss": 0.0184326171875, + "loss": 0.0091, + "step": 2045, + "total_loss": 0.0184326171875 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019656705012379038, + "lm_loss": 0.00775146484375, + "loss": 0.0098, + "step": 2046, + "total_loss": 0.00775146484375 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019656371289350608, + "lm_loss": 0.01214599609375, + "loss": 0.0085, + "step": 2047, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019656037407027674, + "lm_loss": 0.00653076171875, + "loss": 0.0062, + "step": 2048, + "total_loss": 0.00653076171875 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019655703365415743, + "lm_loss": 0.006988525390625, + "loss": 0.009, + "step": 2049, + "total_loss": 0.006988525390625 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019655369164520324, + "lm_loss": 0.006866455078125, + "loss": 0.0087, + "step": 2050, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001965503480434693, + "lm_loss": 0.0037841796875, + "loss": 0.0096, + "step": 2051, + "total_loss": 0.0037841796875 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019654700284901078, + "lm_loss": 0.00762939453125, + "loss": 0.0092, + "step": 2052, + "total_loss": 0.00762939453125 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019654365606188289, + "lm_loss": 0.004241943359375, + "loss": 0.0069, + "step": 2053, + "total_loss": 0.004241943359375 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019654030768214076, + "lm_loss": 0.00982666015625, + "loss": 0.0093, + "step": 2054, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001965369577098397, + "lm_loss": 0.00830078125, + "loss": 0.0082, + "step": 2055, + "total_loss": 0.00830078125 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019653360614503495, + "lm_loss": 0.004974365234375, + "loss": 0.0087, + "step": 2056, + "total_loss": 0.004974365234375 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001965302529877818, + "lm_loss": 0.00848388671875, + "loss": 0.0077, + "step": 2057, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019652689823813557, + "lm_loss": 0.0069580078125, + "loss": 0.008, + "step": 2058, + "total_loss": 0.0069580078125 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019652354189615158, + "lm_loss": 0.004913330078125, + "loss": 0.0082, + "step": 2059, + "total_loss": 0.004913330078125 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019652018396188522, + "lm_loss": 0.0087890625, + "loss": 0.0079, + "step": 2060, + "total_loss": 0.0087890625 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019651682443539188, + "lm_loss": 0.0096435546875, + "loss": 0.0064, + "step": 2061, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019651346331672696, + "lm_loss": 0.00494384765625, + "loss": 0.0091, + "step": 2062, + "total_loss": 0.00494384765625 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001965101006059459, + "lm_loss": 0.01348876953125, + "loss": 0.0101, + "step": 2063, + "total_loss": 0.01348876953125 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019650673630310425, + "lm_loss": 0.00933837890625, + "loss": 0.0086, + "step": 2064, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019650337040825742, + "lm_loss": 0.00689697265625, + "loss": 0.0096, + "step": 2065, + "total_loss": 0.00689697265625 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019650000292146093, + "lm_loss": 0.0133056640625, + "loss": 0.0097, + "step": 2066, + "total_loss": 0.0133056640625 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019649663384277042, + "lm_loss": 0.01409912109375, + "loss": 0.0094, + "step": 2067, + "total_loss": 0.01409912109375 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019649326317224138, + "lm_loss": 0.0123291015625, + "loss": 0.0098, + "step": 2068, + "total_loss": 0.0123291015625 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019648989090992946, + "lm_loss": 0.01055908203125, + "loss": 0.0095, + "step": 2069, + "total_loss": 0.01055908203125 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019648651705589027, + "lm_loss": 0.011474609375, + "loss": 0.008, + "step": 2070, + "total_loss": 0.011474609375 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001964831416101795, + "lm_loss": 0.0130615234375, + "loss": 0.0098, + "step": 2071, + "total_loss": 0.0130615234375 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001964797645728528, + "lm_loss": 0.01214599609375, + "loss": 0.0093, + "step": 2072, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019647638594396587, + "lm_loss": 0.00738525390625, + "loss": 0.0084, + "step": 2073, + "total_loss": 0.00738525390625 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019647300572357446, + "lm_loss": 0.00933837890625, + "loss": 0.0085, + "step": 2074, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019646962391173432, + "lm_loss": 0.00653076171875, + "loss": 0.0071, + "step": 2075, + "total_loss": 0.00653076171875 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019646624050850123, + "lm_loss": 0.0027923583984375, + "loss": 0.01, + "step": 2076, + "total_loss": 0.0027923583984375 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019646285551393108, + "lm_loss": 0.0045166015625, + "loss": 0.0087, + "step": 2077, + "total_loss": 0.0045166015625 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001964594689280796, + "lm_loss": 0.0079345703125, + "loss": 0.0087, + "step": 2078, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019645608075100274, + "lm_loss": 0.00946044921875, + "loss": 0.0076, + "step": 2079, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019645269098275637, + "lm_loss": 0.013427734375, + "loss": 0.0077, + "step": 2080, + "total_loss": 0.013427734375 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019644929962339637, + "lm_loss": 0.00141143798828125, + "loss": 0.0069, + "step": 2081, + "total_loss": 0.00141143798828125 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019644590667297873, + "lm_loss": 0.01446533203125, + "loss": 0.0083, + "step": 2082, + "total_loss": 0.01446533203125 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001964425121315594, + "lm_loss": 0.007537841796875, + "loss": 0.0097, + "step": 2083, + "total_loss": 0.007537841796875 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019643911599919437, + "lm_loss": 0.0048828125, + "loss": 0.0089, + "step": 2084, + "total_loss": 0.0048828125 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001964357182759397, + "lm_loss": 0.0028228759765625, + "loss": 0.0079, + "step": 2085, + "total_loss": 0.0028228759765625 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019643231896185142, + "lm_loss": 0.01025390625, + "loss": 0.0093, + "step": 2086, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019642891805698556, + "lm_loss": 0.008056640625, + "loss": 0.0102, + "step": 2087, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001964255155613983, + "lm_loss": 0.007659912109375, + "loss": 0.0078, + "step": 2088, + "total_loss": 0.007659912109375 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001964221114751457, + "lm_loss": 0.00311279296875, + "loss": 0.0074, + "step": 2089, + "total_loss": 0.00311279296875 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019641870579828398, + "lm_loss": 0.010986328125, + "loss": 0.0102, + "step": 2090, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001964152985308693, + "lm_loss": 0.0048828125, + "loss": 0.008, + "step": 2091, + "total_loss": 0.0048828125 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019641188967295782, + "lm_loss": 0.0091552734375, + "loss": 0.0073, + "step": 2092, + "total_loss": 0.0091552734375 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019640847922460585, + "lm_loss": 0.01116943359375, + "loss": 0.0087, + "step": 2093, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019640506718586958, + "lm_loss": 0.00390625, + "loss": 0.0064, + "step": 2094, + "total_loss": 0.00390625 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019640165355680535, + "lm_loss": 0.005340576171875, + "loss": 0.0089, + "step": 2095, + "total_loss": 0.005340576171875 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019639823833746942, + "lm_loss": 0.006011962890625, + "loss": 0.0097, + "step": 2096, + "total_loss": 0.006011962890625 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019639482152791817, + "lm_loss": 0.0072021484375, + "loss": 0.0078, + "step": 2097, + "total_loss": 0.0072021484375 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019639140312820795, + "lm_loss": 0.00994873046875, + "loss": 0.0091, + "step": 2098, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019638798313839515, + "lm_loss": 0.004150390625, + "loss": 0.0076, + "step": 2099, + "total_loss": 0.004150390625 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019638456155853619, + "lm_loss": 0.0057373046875, + "loss": 0.0064, + "step": 2100, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.86, + "eval_lm_loss": 0.00981560442596674, + "eval_loss": 0.010241355746984482, + "eval_runtime": 43.9945, + "eval_samples_per_second": 22.73, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.00981560442596674, + "lm_loss": 0.000827789306640625, + "step": 2100, + "total_loss": 0.000827789306640625 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019638113838868753, + "lm_loss": 0.01141357421875, + "loss": 0.0081, + "step": 2101, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001963777136289056, + "lm_loss": 0.010009765625, + "loss": 0.0095, + "step": 2102, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019637428727924694, + "lm_loss": 0.00616455078125, + "loss": 0.0083, + "step": 2103, + "total_loss": 0.00616455078125 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019637085933976804, + "lm_loss": 0.0079345703125, + "loss": 0.0086, + "step": 2104, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019636742981052544, + "lm_loss": 0.00994873046875, + "loss": 0.0084, + "step": 2105, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019636399869157577, + "lm_loss": 0.0031585693359375, + "loss": 0.0062, + "step": 2106, + "total_loss": 0.0031585693359375 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019636056598297557, + "lm_loss": 0.00457763671875, + "loss": 0.0056, + "step": 2107, + "total_loss": 0.00457763671875 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019635713168478152, + "lm_loss": 0.006561279296875, + "loss": 0.0076, + "step": 2108, + "total_loss": 0.006561279296875 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001963536957970502, + "lm_loss": 0.00933837890625, + "loss": 0.0077, + "step": 2109, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019635025831983834, + "lm_loss": 0.010009765625, + "loss": 0.0085, + "step": 2110, + "total_loss": 0.010009765625 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019634681925320265, + "lm_loss": 0.01080322265625, + "loss": 0.0082, + "step": 2111, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001963433785971999, + "lm_loss": 0.0096435546875, + "loss": 0.0081, + "step": 2112, + "total_loss": 0.0096435546875 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019633993635188675, + "lm_loss": 0.0130615234375, + "loss": 0.0083, + "step": 2113, + "total_loss": 0.0130615234375 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019633649251732006, + "lm_loss": 0.0106201171875, + "loss": 0.0083, + "step": 2114, + "total_loss": 0.0106201171875 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019633304709355657, + "lm_loss": 0.005615234375, + "loss": 0.0071, + "step": 2115, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001963296000806532, + "lm_loss": 0.00823974609375, + "loss": 0.0084, + "step": 2116, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019632615147866677, + "lm_loss": 0.0079345703125, + "loss": 0.0089, + "step": 2117, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019632270128765416, + "lm_loss": 0.00677490234375, + "loss": 0.0084, + "step": 2118, + "total_loss": 0.00677490234375 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001963192495076723, + "lm_loss": 0.0050048828125, + "loss": 0.0062, + "step": 2119, + "total_loss": 0.0050048828125 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001963157961387782, + "lm_loss": 0.026123046875, + "loss": 0.0099, + "step": 2120, + "total_loss": 0.026123046875 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019631234118102866, + "lm_loss": 0.0067138671875, + "loss": 0.0099, + "step": 2121, + "total_loss": 0.0067138671875 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001963088846344808, + "lm_loss": 0.00445556640625, + "loss": 0.0081, + "step": 2122, + "total_loss": 0.00445556640625 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019630542649919165, + "lm_loss": 0.0093994140625, + "loss": 0.0092, + "step": 2123, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001963019667752182, + "lm_loss": 0.003204345703125, + "loss": 0.0079, + "step": 2124, + "total_loss": 0.003204345703125 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019629850546261753, + "lm_loss": 0.010498046875, + "loss": 0.0082, + "step": 2125, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019629504256144675, + "lm_loss": 0.01092529296875, + "loss": 0.0096, + "step": 2126, + "total_loss": 0.01092529296875 + }, + { + "epoch": 0.87, + "learning_rate": 0.000196291578071763, + "lm_loss": 0.00836181640625, + "loss": 0.0101, + "step": 2127, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019628811199362342, + "lm_loss": 0.006561279296875, + "loss": 0.0076, + "step": 2128, + "total_loss": 0.006561279296875 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019628464432708517, + "lm_loss": 0.00518798828125, + "loss": 0.0075, + "step": 2129, + "total_loss": 0.00518798828125 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019628117507220546, + "lm_loss": 0.005706787109375, + "loss": 0.008, + "step": 2130, + "total_loss": 0.005706787109375 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019627770422904158, + "lm_loss": 0.00567626953125, + "loss": 0.0072, + "step": 2131, + "total_loss": 0.00567626953125 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001962742317976507, + "lm_loss": 0.00750732421875, + "loss": 0.0079, + "step": 2132, + "total_loss": 0.00750732421875 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019627075777809013, + "lm_loss": 0.003662109375, + "loss": 0.0089, + "step": 2133, + "total_loss": 0.003662109375 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019626728217041718, + "lm_loss": 0.00823974609375, + "loss": 0.008, + "step": 2134, + "total_loss": 0.00823974609375 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019626380497468922, + "lm_loss": 0.00982666015625, + "loss": 0.0087, + "step": 2135, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019626032619096358, + "lm_loss": 0.00335693359375, + "loss": 0.0085, + "step": 2136, + "total_loss": 0.00335693359375 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019625684581929762, + "lm_loss": 0.00360107421875, + "loss": 0.0068, + "step": 2137, + "total_loss": 0.00360107421875 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001962533638597488, + "lm_loss": 0.0079345703125, + "loss": 0.0073, + "step": 2138, + "total_loss": 0.0079345703125 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019624988031237455, + "lm_loss": 0.01031494140625, + "loss": 0.0071, + "step": 2139, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019624639517723232, + "lm_loss": 0.006378173828125, + "loss": 0.0097, + "step": 2140, + "total_loss": 0.006378173828125 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019624290845437958, + "lm_loss": 0.00872802734375, + "loss": 0.0075, + "step": 2141, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019623942014387392, + "lm_loss": 0.005584716796875, + "loss": 0.0069, + "step": 2142, + "total_loss": 0.005584716796875 + }, + { + "epoch": 0.88, + "learning_rate": 0.0001962359302457728, + "lm_loss": 0.01214599609375, + "loss": 0.0091, + "step": 2143, + "total_loss": 0.01214599609375 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019623243876013387, + "lm_loss": 0.005828857421875, + "loss": 0.0073, + "step": 2144, + "total_loss": 0.005828857421875 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019622894568701465, + "lm_loss": 0.0162353515625, + "loss": 0.0087, + "step": 2145, + "total_loss": 0.0162353515625 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019622545102647286, + "lm_loss": 0.00994873046875, + "loss": 0.0082, + "step": 2146, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019622195477856603, + "lm_loss": 0.0068359375, + "loss": 0.0077, + "step": 2147, + "total_loss": 0.0068359375 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019621845694335193, + "lm_loss": 0.01300048828125, + "loss": 0.008, + "step": 2148, + "total_loss": 0.01300048828125 + }, + { + "epoch": 0.88, + "learning_rate": 0.0001962149575208882, + "lm_loss": 0.00958251953125, + "loss": 0.0084, + "step": 2149, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.88, + "learning_rate": 0.0001962114565112326, + "lm_loss": 0.0164794921875, + "loss": 0.0094, + "step": 2150, + "total_loss": 0.0164794921875 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019620795391444288, + "lm_loss": 0.01422119140625, + "loss": 0.0076, + "step": 2151, + "total_loss": 0.01422119140625 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019620444973057684, + "lm_loss": 0.01507568359375, + "loss": 0.0073, + "step": 2152, + "total_loss": 0.01507568359375 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019620094395969226, + "lm_loss": 0.00408935546875, + "loss": 0.0083, + "step": 2153, + "total_loss": 0.00408935546875 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019619743660184694, + "lm_loss": 0.01068115234375, + "loss": 0.0097, + "step": 2154, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.88, + "learning_rate": 0.0001961939276570988, + "lm_loss": 0.006317138671875, + "loss": 0.008, + "step": 2155, + "total_loss": 0.006317138671875 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019619041712550572, + "lm_loss": 0.00958251953125, + "loss": 0.0081, + "step": 2156, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019618690500712558, + "lm_loss": 0.00506591796875, + "loss": 0.0062, + "step": 2157, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019618339130201632, + "lm_loss": 0.005523681640625, + "loss": 0.0079, + "step": 2158, + "total_loss": 0.005523681640625 + }, + { + "epoch": 0.88, + "learning_rate": 0.0001961798760102359, + "lm_loss": 0.006988525390625, + "loss": 0.0081, + "step": 2159, + "total_loss": 0.006988525390625 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019617635913184232, + "lm_loss": 0.005889892578125, + "loss": 0.0078, + "step": 2160, + "total_loss": 0.005889892578125 + }, + { + "epoch": 0.88, + "learning_rate": 0.0001961728406668936, + "lm_loss": 0.002899169921875, + "loss": 0.0073, + "step": 2161, + "total_loss": 0.002899169921875 + }, + { + "epoch": 0.88, + "learning_rate": 0.0001961693206154478, + "lm_loss": 0.007171630859375, + "loss": 0.0087, + "step": 2162, + "total_loss": 0.007171630859375 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019616579897756292, + "lm_loss": 0.010498046875, + "loss": 0.0077, + "step": 2163, + "total_loss": 0.010498046875 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019616227575329712, + "lm_loss": 0.00726318359375, + "loss": 0.0071, + "step": 2164, + "total_loss": 0.00726318359375 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001961587509427085, + "lm_loss": 0.005615234375, + "loss": 0.0074, + "step": 2165, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001961552245458552, + "lm_loss": 0.0174560546875, + "loss": 0.0071, + "step": 2166, + "total_loss": 0.0174560546875 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001961516965627954, + "lm_loss": 0.01019287109375, + "loss": 0.0083, + "step": 2167, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001961481669935873, + "lm_loss": 0.0081787109375, + "loss": 0.0073, + "step": 2168, + "total_loss": 0.0081787109375 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019614463583828915, + "lm_loss": 0.00836181640625, + "loss": 0.0102, + "step": 2169, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019614110309695916, + "lm_loss": 0.006134033203125, + "loss": 0.0065, + "step": 2170, + "total_loss": 0.006134033203125 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019613756876965557, + "lm_loss": 0.00946044921875, + "loss": 0.0087, + "step": 2171, + "total_loss": 0.00946044921875 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019613403285643677, + "lm_loss": 0.00897216796875, + "loss": 0.0085, + "step": 2172, + "total_loss": 0.00897216796875 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019613049535736107, + "lm_loss": 0.004180908203125, + "loss": 0.0098, + "step": 2173, + "total_loss": 0.004180908203125 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001961269562724868, + "lm_loss": 0.00506591796875, + "loss": 0.0084, + "step": 2174, + "total_loss": 0.00506591796875 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001961234156018723, + "lm_loss": 0.01129150390625, + "loss": 0.0102, + "step": 2175, + "total_loss": 0.01129150390625 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019611987334557609, + "lm_loss": 0.024169921875, + "loss": 0.0094, + "step": 2176, + "total_loss": 0.024169921875 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019611632950365651, + "lm_loss": 0.007415771484375, + "loss": 0.0081, + "step": 2177, + "total_loss": 0.007415771484375 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019611278407617207, + "lm_loss": 0.00665283203125, + "loss": 0.0085, + "step": 2178, + "total_loss": 0.00665283203125 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019610923706318124, + "lm_loss": 0.005126953125, + "loss": 0.0071, + "step": 2179, + "total_loss": 0.005126953125 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001961056884647425, + "lm_loss": 0.00653076171875, + "loss": 0.0098, + "step": 2180, + "total_loss": 0.00653076171875 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019610213828091447, + "lm_loss": 0.0146484375, + "loss": 0.0108, + "step": 2181, + "total_loss": 0.0146484375 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019609858651175563, + "lm_loss": 0.01129150390625, + "loss": 0.0113, + "step": 2182, + "total_loss": 0.01129150390625 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019609503315732462, + "lm_loss": 0.00396728515625, + "loss": 0.0063, + "step": 2183, + "total_loss": 0.00396728515625 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019609147821768002, + "lm_loss": 0.012451171875, + "loss": 0.0084, + "step": 2184, + "total_loss": 0.012451171875 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019608792169288053, + "lm_loss": 0.01324462890625, + "loss": 0.0087, + "step": 2185, + "total_loss": 0.01324462890625 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019608436358298475, + "lm_loss": 0.00787353515625, + "loss": 0.008, + "step": 2186, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019608080388805146, + "lm_loss": 0.0123291015625, + "loss": 0.0081, + "step": 2187, + "total_loss": 0.0123291015625 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001960772426081393, + "lm_loss": 0.01202392578125, + "loss": 0.0093, + "step": 2188, + "total_loss": 0.01202392578125 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019607367974330706, + "lm_loss": 0.005889892578125, + "loss": 0.0072, + "step": 2189, + "total_loss": 0.005889892578125 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001960701152936135, + "lm_loss": 0.010986328125, + "loss": 0.0094, + "step": 2190, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019606654925911744, + "lm_loss": 0.00885009765625, + "loss": 0.0085, + "step": 2191, + "total_loss": 0.00885009765625 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001960629816398777, + "lm_loss": 0.01409912109375, + "loss": 0.0086, + "step": 2192, + "total_loss": 0.01409912109375 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019605941243595312, + "lm_loss": 0.00408935546875, + "loss": 0.0082, + "step": 2193, + "total_loss": 0.00408935546875 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001960558416474026, + "lm_loss": 0.00848388671875, + "loss": 0.0101, + "step": 2194, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019605226927428497, + "lm_loss": 0.01312255859375, + "loss": 0.0089, + "step": 2195, + "total_loss": 0.01312255859375 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019604869531665925, + "lm_loss": 0.01031494140625, + "loss": 0.0104, + "step": 2196, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019604511977458437, + "lm_loss": 0.00726318359375, + "loss": 0.0072, + "step": 2197, + "total_loss": 0.00726318359375 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019604154264811933, + "lm_loss": 0.004638671875, + "loss": 0.006, + "step": 2198, + "total_loss": 0.004638671875 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001960379639373231, + "lm_loss": 0.006866455078125, + "loss": 0.0078, + "step": 2199, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019603438364225475, + "lm_loss": 0.0093994140625, + "loss": 0.008, + "step": 2200, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.9, + "eval_lm_loss": 0.009770958684384823, + "eval_loss": 0.010218048468232155, + "eval_runtime": 43.9063, + "eval_samples_per_second": 22.776, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009770958684384823, + "lm_loss": 0.0012969970703125, + "step": 2200, + "total_loss": 0.0012969970703125 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019603080176297332, + "lm_loss": 0.01300048828125, + "loss": 0.0095, + "step": 2201, + "total_loss": 0.01300048828125 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019602721829953788, + "lm_loss": 0.0166015625, + "loss": 0.0084, + "step": 2202, + "total_loss": 0.0166015625 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001960236332520076, + "lm_loss": 0.00787353515625, + "loss": 0.0089, + "step": 2203, + "total_loss": 0.00787353515625 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001960200466204416, + "lm_loss": 0.004730224609375, + "loss": 0.009, + "step": 2204, + "total_loss": 0.004730224609375 + }, + { + "epoch": 0.9, + "learning_rate": 0.000196016458404899, + "lm_loss": 0.004425048828125, + "loss": 0.0093, + "step": 2205, + "total_loss": 0.004425048828125 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019601286860543903, + "lm_loss": 0.0140380859375, + "loss": 0.0083, + "step": 2206, + "total_loss": 0.0140380859375 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019600927722212094, + "lm_loss": 0.0135498046875, + "loss": 0.0116, + "step": 2207, + "total_loss": 0.0135498046875 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019600568425500389, + "lm_loss": 0.0093994140625, + "loss": 0.0079, + "step": 2208, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019600208970414724, + "lm_loss": 0.006195068359375, + "loss": 0.0081, + "step": 2209, + "total_loss": 0.006195068359375 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019599849356961025, + "lm_loss": 0.00677490234375, + "loss": 0.0076, + "step": 2210, + "total_loss": 0.00677490234375 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019599489585145222, + "lm_loss": 0.00982666015625, + "loss": 0.0074, + "step": 2211, + "total_loss": 0.00982666015625 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019599129654973254, + "lm_loss": 0.004852294921875, + "loss": 0.0076, + "step": 2212, + "total_loss": 0.004852294921875 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019598769566451054, + "lm_loss": 0.0057373046875, + "loss": 0.0065, + "step": 2213, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019598409319584567, + "lm_loss": 0.00494384765625, + "loss": 0.0081, + "step": 2214, + "total_loss": 0.00494384765625 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019598048914379733, + "lm_loss": 0.007781982421875, + "loss": 0.0097, + "step": 2215, + "total_loss": 0.007781982421875 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019597688350842494, + "lm_loss": 0.00677490234375, + "loss": 0.0075, + "step": 2216, + "total_loss": 0.00677490234375 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001959732762897881, + "lm_loss": 0.0093994140625, + "loss": 0.0075, + "step": 2217, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019596966748794615, + "lm_loss": 0.011962890625, + "loss": 0.0086, + "step": 2218, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019596605710295873, + "lm_loss": 0.004119873046875, + "loss": 0.0078, + "step": 2219, + "total_loss": 0.004119873046875 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019596244513488538, + "lm_loss": 0.005615234375, + "loss": 0.0076, + "step": 2220, + "total_loss": 0.005615234375 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001959588315837857, + "lm_loss": 0.01190185546875, + "loss": 0.0079, + "step": 2221, + "total_loss": 0.01190185546875 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019595521644971925, + "lm_loss": 0.0027618408203125, + "loss": 0.0078, + "step": 2222, + "total_loss": 0.0027618408203125 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019595159973274568, + "lm_loss": 0.0038299560546875, + "loss": 0.0083, + "step": 2223, + "total_loss": 0.0038299560546875 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019594798143292467, + "lm_loss": 0.007568359375, + "loss": 0.0068, + "step": 2224, + "total_loss": 0.007568359375 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019594436155031595, + "lm_loss": 0.0034942626953125, + "loss": 0.0084, + "step": 2225, + "total_loss": 0.0034942626953125 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019594074008497914, + "lm_loss": 0.0045166015625, + "loss": 0.009, + "step": 2226, + "total_loss": 0.0045166015625 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019593711703697406, + "lm_loss": 0.004974365234375, + "loss": 0.0082, + "step": 2227, + "total_loss": 0.004974365234375 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019593349240636042, + "lm_loss": 0.00836181640625, + "loss": 0.0079, + "step": 2228, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019592986619319807, + "lm_loss": 0.00738525390625, + "loss": 0.009, + "step": 2229, + "total_loss": 0.00738525390625 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001959262383975468, + "lm_loss": 0.007537841796875, + "loss": 0.007, + "step": 2230, + "total_loss": 0.007537841796875 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001959226090194664, + "lm_loss": 0.006195068359375, + "loss": 0.0085, + "step": 2231, + "total_loss": 0.006195068359375 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019591897805901684, + "lm_loss": 0.01348876953125, + "loss": 0.0081, + "step": 2232, + "total_loss": 0.01348876953125 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019591534551625798, + "lm_loss": 0.01348876953125, + "loss": 0.009, + "step": 2233, + "total_loss": 0.01348876953125 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019591171139124973, + "lm_loss": 0.003570556640625, + "loss": 0.0088, + "step": 2234, + "total_loss": 0.003570556640625 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019590807568405202, + "lm_loss": 0.01171875, + "loss": 0.0082, + "step": 2235, + "total_loss": 0.01171875 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001959044383947249, + "lm_loss": 0.00701904296875, + "loss": 0.0088, + "step": 2236, + "total_loss": 0.00701904296875 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019590079952332829, + "lm_loss": 0.00537109375, + "loss": 0.0086, + "step": 2237, + "total_loss": 0.00537109375 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019589715906992224, + "lm_loss": 0.009765625, + "loss": 0.0079, + "step": 2238, + "total_loss": 0.009765625 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019589351703456687, + "lm_loss": 0.00836181640625, + "loss": 0.0097, + "step": 2239, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019588987341732217, + "lm_loss": 0.005035400390625, + "loss": 0.0085, + "step": 2240, + "total_loss": 0.005035400390625 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001958862282182483, + "lm_loss": 0.006927490234375, + "loss": 0.0085, + "step": 2241, + "total_loss": 0.006927490234375 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019588258143740536, + "lm_loss": 0.00994873046875, + "loss": 0.0068, + "step": 2242, + "total_loss": 0.00994873046875 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019587893307485352, + "lm_loss": 0.0036163330078125, + "loss": 0.0096, + "step": 2243, + "total_loss": 0.0036163330078125 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019587528313065296, + "lm_loss": 0.01177978515625, + "loss": 0.0074, + "step": 2244, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019587163160486392, + "lm_loss": 0.00518798828125, + "loss": 0.0076, + "step": 2245, + "total_loss": 0.00518798828125 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019586797849754662, + "lm_loss": 0.006011962890625, + "loss": 0.0066, + "step": 2246, + "total_loss": 0.006011962890625 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001958643238087613, + "lm_loss": 0.0093994140625, + "loss": 0.0091, + "step": 2247, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019586066753856827, + "lm_loss": 0.01116943359375, + "loss": 0.0098, + "step": 2248, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019585700968702784, + "lm_loss": 0.004852294921875, + "loss": 0.0076, + "step": 2249, + "total_loss": 0.004852294921875 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019585335025420038, + "lm_loss": 0.017822265625, + "loss": 0.0072, + "step": 2250, + "total_loss": 0.017822265625 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019584968924014619, + "lm_loss": 0.00634765625, + "loss": 0.0095, + "step": 2251, + "total_loss": 0.00634765625 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001958460266449257, + "lm_loss": 0.00640869140625, + "loss": 0.0085, + "step": 2252, + "total_loss": 0.00640869140625 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019584236246859938, + "lm_loss": 0.013427734375, + "loss": 0.0085, + "step": 2253, + "total_loss": 0.013427734375 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001958386967112276, + "lm_loss": 0.004119873046875, + "loss": 0.0081, + "step": 2254, + "total_loss": 0.004119873046875 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019583502937287086, + "lm_loss": 0.008056640625, + "loss": 0.0084, + "step": 2255, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019583136045358965, + "lm_loss": 0.006256103515625, + "loss": 0.0102, + "step": 2256, + "total_loss": 0.006256103515625 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001958276899534445, + "lm_loss": 0.004791259765625, + "loss": 0.0079, + "step": 2257, + "total_loss": 0.004791259765625 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019582401787249598, + "lm_loss": 0.01202392578125, + "loss": 0.0107, + "step": 2258, + "total_loss": 0.01202392578125 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019582034421080464, + "lm_loss": 0.007232666015625, + "loss": 0.0074, + "step": 2259, + "total_loss": 0.007232666015625 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001958166689684311, + "lm_loss": 0.0036773681640625, + "loss": 0.0085, + "step": 2260, + "total_loss": 0.0036773681640625 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019581299214543595, + "lm_loss": 0.00933837890625, + "loss": 0.0084, + "step": 2261, + "total_loss": 0.00933837890625 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019580931374187988, + "lm_loss": 0.0101318359375, + "loss": 0.0079, + "step": 2262, + "total_loss": 0.0101318359375 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019580563375782358, + "lm_loss": 0.0069580078125, + "loss": 0.008, + "step": 2263, + "total_loss": 0.0069580078125 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001958019521933277, + "lm_loss": 0.006439208984375, + "loss": 0.0077, + "step": 2264, + "total_loss": 0.006439208984375 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019579826904845302, + "lm_loss": 0.00750732421875, + "loss": 0.0066, + "step": 2265, + "total_loss": 0.00750732421875 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001957945843232603, + "lm_loss": 0.016357421875, + "loss": 0.0094, + "step": 2266, + "total_loss": 0.016357421875 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001957908980178103, + "lm_loss": 0.004241943359375, + "loss": 0.009, + "step": 2267, + "total_loss": 0.004241943359375 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019578721013216384, + "lm_loss": 0.005584716796875, + "loss": 0.0087, + "step": 2268, + "total_loss": 0.005584716796875 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019578352066638175, + "lm_loss": 0.0125732421875, + "loss": 0.0098, + "step": 2269, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019577982962052493, + "lm_loss": 0.0052490234375, + "loss": 0.007, + "step": 2270, + "total_loss": 0.0052490234375 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019577613699465422, + "lm_loss": 0.00726318359375, + "loss": 0.0088, + "step": 2271, + "total_loss": 0.00726318359375 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019577244278883057, + "lm_loss": 0.01275634765625, + "loss": 0.0075, + "step": 2272, + "total_loss": 0.01275634765625 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019576874700311488, + "lm_loss": 0.005462646484375, + "loss": 0.0083, + "step": 2273, + "total_loss": 0.005462646484375 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019576504963756818, + "lm_loss": 0.00927734375, + "loss": 0.0086, + "step": 2274, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019576135069225136, + "lm_loss": 0.006622314453125, + "loss": 0.0069, + "step": 2275, + "total_loss": 0.006622314453125 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019575765016722555, + "lm_loss": 0.00421142578125, + "loss": 0.0082, + "step": 2276, + "total_loss": 0.00421142578125 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019575394806255173, + "lm_loss": 0.005523681640625, + "loss": 0.0089, + "step": 2277, + "total_loss": 0.005523681640625 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019575024437829103, + "lm_loss": 0.0057373046875, + "loss": 0.0068, + "step": 2278, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019574653911450446, + "lm_loss": 0.01251220703125, + "loss": 0.0079, + "step": 2279, + "total_loss": 0.01251220703125 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019574283227125321, + "lm_loss": 0.011474609375, + "loss": 0.0096, + "step": 2280, + "total_loss": 0.011474609375 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019573912384859843, + "lm_loss": 0.01068115234375, + "loss": 0.008, + "step": 2281, + "total_loss": 0.01068115234375 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019573541384660124, + "lm_loss": 0.005889892578125, + "loss": 0.0073, + "step": 2282, + "total_loss": 0.005889892578125 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001957317022653229, + "lm_loss": 0.006256103515625, + "loss": 0.0078, + "step": 2283, + "total_loss": 0.006256103515625 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019572798910482458, + "lm_loss": 0.0120849609375, + "loss": 0.0079, + "step": 2284, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001957242743651676, + "lm_loss": 0.0042724609375, + "loss": 0.0085, + "step": 2285, + "total_loss": 0.0042724609375 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019572055804641317, + "lm_loss": 0.01031494140625, + "loss": 0.0082, + "step": 2286, + "total_loss": 0.01031494140625 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019571684014862268, + "lm_loss": 0.001312255859375, + "loss": 0.0089, + "step": 2287, + "total_loss": 0.001312255859375 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019571312067185738, + "lm_loss": 0.0033416748046875, + "loss": 0.0102, + "step": 2288, + "total_loss": 0.0033416748046875 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019570939961617866, + "lm_loss": 0.01141357421875, + "loss": 0.0094, + "step": 2289, + "total_loss": 0.01141357421875 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019570567698164792, + "lm_loss": 0.0091552734375, + "loss": 0.0079, + "step": 2290, + "total_loss": 0.0091552734375 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019570195276832656, + "lm_loss": 0.00653076171875, + "loss": 0.0058, + "step": 2291, + "total_loss": 0.00653076171875 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019569822697627597, + "lm_loss": 0.008544921875, + "loss": 0.0074, + "step": 2292, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001956944996055577, + "lm_loss": 0.0179443359375, + "loss": 0.0081, + "step": 2293, + "total_loss": 0.0179443359375 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019569077065623316, + "lm_loss": 0.00958251953125, + "loss": 0.0084, + "step": 2294, + "total_loss": 0.00958251953125 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019568704012836393, + "lm_loss": 0.01470947265625, + "loss": 0.0085, + "step": 2295, + "total_loss": 0.01470947265625 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019568330802201148, + "lm_loss": 0.01007080078125, + "loss": 0.0073, + "step": 2296, + "total_loss": 0.01007080078125 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001956795743372374, + "lm_loss": 0.007354736328125, + "loss": 0.0061, + "step": 2297, + "total_loss": 0.007354736328125 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019567583907410333, + "lm_loss": 0.0026397705078125, + "loss": 0.0085, + "step": 2298, + "total_loss": 0.0026397705078125 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001956721022326708, + "lm_loss": 0.007537841796875, + "loss": 0.0057, + "step": 2299, + "total_loss": 0.007537841796875 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019566836381300156, + "lm_loss": 0.00836181640625, + "loss": 0.007, + "step": 2300, + "total_loss": 0.00836181640625 + }, + { + "epoch": 0.94, + "eval_lm_loss": 0.00943505298346281, + "eval_loss": 0.009795456193387508, + "eval_runtime": 43.8801, + "eval_samples_per_second": 22.789, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.00943505298346281, + "lm_loss": 0.0010986328125, + "step": 2300, + "total_loss": 0.0010986328125 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019566462381515718, + "lm_loss": 0.004180908203125, + "loss": 0.0064, + "step": 2301, + "total_loss": 0.004180908203125 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019566088223919943, + "lm_loss": 0.006103515625, + "loss": 0.008, + "step": 2302, + "total_loss": 0.006103515625 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019565713908518997, + "lm_loss": 0.01116943359375, + "loss": 0.0093, + "step": 2303, + "total_loss": 0.01116943359375 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019565339435319062, + "lm_loss": 0.00396728515625, + "loss": 0.0089, + "step": 2304, + "total_loss": 0.00396728515625 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001956496480432631, + "lm_loss": 0.005523681640625, + "loss": 0.01, + "step": 2305, + "total_loss": 0.005523681640625 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019564590015546922, + "lm_loss": 0.005889892578125, + "loss": 0.0108, + "step": 2306, + "total_loss": 0.005889892578125 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001956421506898708, + "lm_loss": 0.00469970703125, + "loss": 0.0064, + "step": 2307, + "total_loss": 0.00469970703125 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019563839964652972, + "lm_loss": 0.0074462890625, + "loss": 0.0085, + "step": 2308, + "total_loss": 0.0074462890625 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019563464702550786, + "lm_loss": 0.007720947265625, + "loss": 0.0097, + "step": 2309, + "total_loss": 0.007720947265625 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019563089282686707, + "lm_loss": 0.005096435546875, + "loss": 0.0081, + "step": 2310, + "total_loss": 0.005096435546875 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019562713705066932, + "lm_loss": 0.0052490234375, + "loss": 0.0076, + "step": 2311, + "total_loss": 0.0052490234375 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019562337969697657, + "lm_loss": 0.0118408203125, + "loss": 0.0083, + "step": 2312, + "total_loss": 0.0118408203125 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019561962076585082, + "lm_loss": 0.003509521484375, + "loss": 0.0077, + "step": 2313, + "total_loss": 0.003509521484375 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019561586025735404, + "lm_loss": 0.0025177001953125, + "loss": 0.0089, + "step": 2314, + "total_loss": 0.0025177001953125 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019561209817154827, + "lm_loss": 0.002044677734375, + "loss": 0.0074, + "step": 2315, + "total_loss": 0.002044677734375 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019560833450849558, + "lm_loss": 0.00634765625, + "loss": 0.0079, + "step": 2316, + "total_loss": 0.00634765625 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019560456926825804, + "lm_loss": 0.0068359375, + "loss": 0.0081, + "step": 2317, + "total_loss": 0.0068359375 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019560080245089781, + "lm_loss": 0.00439453125, + "loss": 0.0081, + "step": 2318, + "total_loss": 0.00439453125 + }, + { + "epoch": 0.95, + "learning_rate": 0.000195597034056477, + "lm_loss": 0.0035247802734375, + "loss": 0.0076, + "step": 2319, + "total_loss": 0.0035247802734375 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019559326408505773, + "lm_loss": 0.01287841796875, + "loss": 0.0096, + "step": 2320, + "total_loss": 0.01287841796875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0001955894925367023, + "lm_loss": 0.0103759765625, + "loss": 0.0086, + "step": 2321, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.95, + "learning_rate": 0.0001955857194114728, + "lm_loss": 0.006866455078125, + "loss": 0.0082, + "step": 2322, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019558194470943154, + "lm_loss": 0.0086669921875, + "loss": 0.0086, + "step": 2323, + "total_loss": 0.0086669921875 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019557816843064082, + "lm_loss": 0.002471923828125, + "loss": 0.0078, + "step": 2324, + "total_loss": 0.002471923828125 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019557439057516285, + "lm_loss": 0.011962890625, + "loss": 0.0085, + "step": 2325, + "total_loss": 0.011962890625 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019557061114306, + "lm_loss": 0.0040283203125, + "loss": 0.0084, + "step": 2326, + "total_loss": 0.0040283203125 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019556683013439465, + "lm_loss": 0.007232666015625, + "loss": 0.0082, + "step": 2327, + "total_loss": 0.007232666015625 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019556304754922912, + "lm_loss": 0.008056640625, + "loss": 0.0072, + "step": 2328, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.95, + "learning_rate": 0.0001955592633876258, + "lm_loss": 0.00592041015625, + "loss": 0.0087, + "step": 2329, + "total_loss": 0.00592041015625 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019555547764964714, + "lm_loss": 0.00860595703125, + "loss": 0.0087, + "step": 2330, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019555169033535564, + "lm_loss": 0.008056640625, + "loss": 0.0083, + "step": 2331, + "total_loss": 0.008056640625 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019554790144481364, + "lm_loss": 0.005889892578125, + "loss": 0.0068, + "step": 2332, + "total_loss": 0.005889892578125 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019554411097808382, + "lm_loss": 0.002105712890625, + "loss": 0.0065, + "step": 2333, + "total_loss": 0.002105712890625 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019554031893522856, + "lm_loss": 0.0118408203125, + "loss": 0.0071, + "step": 2334, + "total_loss": 0.0118408203125 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019553652531631048, + "lm_loss": 0.0081787109375, + "loss": 0.0073, + "step": 2335, + "total_loss": 0.0081787109375 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019553273012139216, + "lm_loss": 0.00567626953125, + "loss": 0.0076, + "step": 2336, + "total_loss": 0.00567626953125 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019552893335053622, + "lm_loss": 0.0047607421875, + "loss": 0.0083, + "step": 2337, + "total_loss": 0.0047607421875 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019552513500380525, + "lm_loss": 0.00151824951171875, + "loss": 0.0085, + "step": 2338, + "total_loss": 0.00151824951171875 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001955213350812619, + "lm_loss": 0.01043701171875, + "loss": 0.0087, + "step": 2339, + "total_loss": 0.01043701171875 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019551753358296897, + "lm_loss": 0.0089111328125, + "loss": 0.0086, + "step": 2340, + "total_loss": 0.0089111328125 + }, + { + "epoch": 0.96, + "learning_rate": 0.000195513730508989, + "lm_loss": 0.01177978515625, + "loss": 0.0088, + "step": 2341, + "total_loss": 0.01177978515625 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019550992585938486, + "lm_loss": 0.01104736328125, + "loss": 0.0078, + "step": 2342, + "total_loss": 0.01104736328125 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019550611963421925, + "lm_loss": 0.007171630859375, + "loss": 0.0065, + "step": 2343, + "total_loss": 0.007171630859375 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019550231183355498, + "lm_loss": 0.0115966796875, + "loss": 0.0088, + "step": 2344, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019549850245745488, + "lm_loss": 0.01416015625, + "loss": 0.008, + "step": 2345, + "total_loss": 0.01416015625 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019549469150598173, + "lm_loss": 0.0029144287109375, + "loss": 0.0076, + "step": 2346, + "total_loss": 0.0029144287109375 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019549087897919844, + "lm_loss": 0.00885009765625, + "loss": 0.0093, + "step": 2347, + "total_loss": 0.00885009765625 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019548706487716792, + "lm_loss": 0.009765625, + "loss": 0.0098, + "step": 2348, + "total_loss": 0.009765625 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019548324919995306, + "lm_loss": 0.00628662109375, + "loss": 0.007, + "step": 2349, + "total_loss": 0.00628662109375 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019547943194761684, + "lm_loss": 0.00811767578125, + "loss": 0.0086, + "step": 2350, + "total_loss": 0.00811767578125 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019547561312022218, + "lm_loss": 0.013916015625, + "loss": 0.0093, + "step": 2351, + "total_loss": 0.013916015625 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001954717927178321, + "lm_loss": 0.0034637451171875, + "loss": 0.0092, + "step": 2352, + "total_loss": 0.0034637451171875 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001954679707405096, + "lm_loss": 0.019775390625, + "loss": 0.0099, + "step": 2353, + "total_loss": 0.019775390625 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019546414718831775, + "lm_loss": 0.004974365234375, + "loss": 0.0085, + "step": 2354, + "total_loss": 0.004974365234375 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019546032206131965, + "lm_loss": 0.00531005859375, + "loss": 0.0086, + "step": 2355, + "total_loss": 0.00531005859375 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019545649535957838, + "lm_loss": 0.0137939453125, + "loss": 0.0074, + "step": 2356, + "total_loss": 0.0137939453125 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019545266708315704, + "lm_loss": 0.00872802734375, + "loss": 0.0076, + "step": 2357, + "total_loss": 0.00872802734375 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019544883723211881, + "lm_loss": 0.0057373046875, + "loss": 0.0075, + "step": 2358, + "total_loss": 0.0057373046875 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019544500580652687, + "lm_loss": 0.01080322265625, + "loss": 0.0079, + "step": 2359, + "total_loss": 0.01080322265625 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019544117280644442, + "lm_loss": 0.00714111328125, + "loss": 0.0076, + "step": 2360, + "total_loss": 0.00714111328125 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001954373382319347, + "lm_loss": 0.0054931640625, + "loss": 0.0073, + "step": 2361, + "total_loss": 0.0054931640625 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019543350208306093, + "lm_loss": 0.006866455078125, + "loss": 0.0091, + "step": 2362, + "total_loss": 0.006866455078125 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019542966435988644, + "lm_loss": 0.00494384765625, + "loss": 0.0065, + "step": 2363, + "total_loss": 0.00494384765625 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001954258250624745, + "lm_loss": 0.00848388671875, + "loss": 0.0075, + "step": 2364, + "total_loss": 0.00848388671875 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019542198419088844, + "lm_loss": 0.01019287109375, + "loss": 0.0076, + "step": 2365, + "total_loss": 0.01019287109375 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019541814174519164, + "lm_loss": 0.004150390625, + "loss": 0.0065, + "step": 2366, + "total_loss": 0.004150390625 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019541429772544752, + "lm_loss": 0.0125732421875, + "loss": 0.0078, + "step": 2367, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019541045213171943, + "lm_loss": 0.0135498046875, + "loss": 0.0086, + "step": 2368, + "total_loss": 0.0135498046875 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019540660496407086, + "lm_loss": 0.005889892578125, + "loss": 0.0071, + "step": 2369, + "total_loss": 0.005889892578125 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019540275622256524, + "lm_loss": 0.007354736328125, + "loss": 0.0089, + "step": 2370, + "total_loss": 0.007354736328125 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019539890590726606, + "lm_loss": 0.00640869140625, + "loss": 0.0059, + "step": 2371, + "total_loss": 0.00640869140625 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019539505401823683, + "lm_loss": 0.0030059814453125, + "loss": 0.0082, + "step": 2372, + "total_loss": 0.0030059814453125 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019539120055554111, + "lm_loss": 0.005645751953125, + "loss": 0.0074, + "step": 2373, + "total_loss": 0.005645751953125 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001953873455192425, + "lm_loss": 0.006591796875, + "loss": 0.0067, + "step": 2374, + "total_loss": 0.006591796875 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019538348890940453, + "lm_loss": 0.00494384765625, + "loss": 0.0074, + "step": 2375, + "total_loss": 0.00494384765625 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019537963072609088, + "lm_loss": 0.0034332275390625, + "loss": 0.01, + "step": 2376, + "total_loss": 0.0034332275390625 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019537577096936514, + "lm_loss": 0.00179290771484375, + "loss": 0.0065, + "step": 2377, + "total_loss": 0.00179290771484375 + }, + { + "epoch": 0.97, + "learning_rate": 0.000195371909639291, + "lm_loss": 0.006134033203125, + "loss": 0.0078, + "step": 2378, + "total_loss": 0.006134033203125 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001953680467359322, + "lm_loss": 0.0036163330078125, + "loss": 0.0053, + "step": 2379, + "total_loss": 0.0036163330078125 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019536418225935237, + "lm_loss": 0.0093994140625, + "loss": 0.0082, + "step": 2380, + "total_loss": 0.0093994140625 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019536031620961538, + "lm_loss": 0.006805419921875, + "loss": 0.008, + "step": 2381, + "total_loss": 0.006805419921875 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019535644858678487, + "lm_loss": 0.00921630859375, + "loss": 0.0069, + "step": 2382, + "total_loss": 0.00921630859375 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019535257939092476, + "lm_loss": 0.0125732421875, + "loss": 0.0103, + "step": 2383, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019534870862209885, + "lm_loss": 0.0078125, + "loss": 0.0074, + "step": 2384, + "total_loss": 0.0078125 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019534483628037098, + "lm_loss": 0.005950927734375, + "loss": 0.0076, + "step": 2385, + "total_loss": 0.005950927734375 + }, + { + "epoch": 0.98, + "learning_rate": 0.000195340962365805, + "lm_loss": 0.0068359375, + "loss": 0.0081, + "step": 2386, + "total_loss": 0.0068359375 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019533708687846483, + "lm_loss": 0.0037384033203125, + "loss": 0.0066, + "step": 2387, + "total_loss": 0.0037384033203125 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019533320981841441, + "lm_loss": 0.013916015625, + "loss": 0.0086, + "step": 2388, + "total_loss": 0.013916015625 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001953293311857177, + "lm_loss": 0.004180908203125, + "loss": 0.0072, + "step": 2389, + "total_loss": 0.004180908203125 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001953254509804387, + "lm_loss": 0.0033416748046875, + "loss": 0.0097, + "step": 2390, + "total_loss": 0.0033416748046875 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001953215692026414, + "lm_loss": 0.00927734375, + "loss": 0.0098, + "step": 2391, + "total_loss": 0.00927734375 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019531768585238983, + "lm_loss": 0.01025390625, + "loss": 0.0073, + "step": 2392, + "total_loss": 0.01025390625 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019531380092974808, + "lm_loss": 0.00274658203125, + "loss": 0.0083, + "step": 2393, + "total_loss": 0.00274658203125 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019530991443478019, + "lm_loss": 0.0098876953125, + "loss": 0.0095, + "step": 2394, + "total_loss": 0.0098876953125 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019530602636755028, + "lm_loss": 0.00482177734375, + "loss": 0.0083, + "step": 2395, + "total_loss": 0.00482177734375 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001953021367281225, + "lm_loss": 0.004180908203125, + "loss": 0.0072, + "step": 2396, + "total_loss": 0.004180908203125 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019529824551656107, + "lm_loss": 0.0048828125, + "loss": 0.0081, + "step": 2397, + "total_loss": 0.0048828125 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001952943527329301, + "lm_loss": 0.0081787109375, + "loss": 0.0076, + "step": 2398, + "total_loss": 0.0081787109375 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019529045837729384, + "lm_loss": 0.0167236328125, + "loss": 0.007, + "step": 2399, + "total_loss": 0.0167236328125 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019528656244971652, + "lm_loss": 0.008544921875, + "loss": 0.0101, + "step": 2400, + "total_loss": 0.008544921875 + }, + { + "epoch": 0.98, + "eval_lm_loss": 0.009607951156795025, + "eval_loss": 0.009962158277630806, + "eval_runtime": 43.878, + "eval_samples_per_second": 22.79, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009607951156795025, + "lm_loss": 0.0012359619140625, + "step": 2400, + "total_loss": 0.0012359619140625 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001952826649502624, + "lm_loss": 0.0103759765625, + "loss": 0.0084, + "step": 2401, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019527876587899584, + "lm_loss": 0.0034332275390625, + "loss": 0.0073, + "step": 2402, + "total_loss": 0.0034332275390625 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019527486523598112, + "lm_loss": 0.005859375, + "loss": 0.0073, + "step": 2403, + "total_loss": 0.005859375 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019527096302128253, + "lm_loss": 0.005462646484375, + "loss": 0.0079, + "step": 2404, + "total_loss": 0.005462646484375 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019526705923496453, + "lm_loss": 0.0025177001953125, + "loss": 0.0099, + "step": 2405, + "total_loss": 0.0025177001953125 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019526315387709145, + "lm_loss": 0.00250244140625, + "loss": 0.0073, + "step": 2406, + "total_loss": 0.00250244140625 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019525924694772776, + "lm_loss": 0.0032196044921875, + "loss": 0.007, + "step": 2407, + "total_loss": 0.0032196044921875 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019525533844693787, + "lm_loss": 0.01239013671875, + "loss": 0.0089, + "step": 2408, + "total_loss": 0.01239013671875 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019525142837478632, + "lm_loss": 0.01336669921875, + "loss": 0.0101, + "step": 2409, + "total_loss": 0.01336669921875 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019524751673133755, + "lm_loss": 0.010986328125, + "loss": 0.0089, + "step": 2410, + "total_loss": 0.010986328125 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019524360351665608, + "lm_loss": 0.0062255859375, + "loss": 0.0095, + "step": 2411, + "total_loss": 0.0062255859375 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019523968873080654, + "lm_loss": 0.0103759765625, + "loss": 0.0077, + "step": 2412, + "total_loss": 0.0103759765625 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019523577237385344, + "lm_loss": 0.00579833984375, + "loss": 0.0079, + "step": 2413, + "total_loss": 0.00579833984375 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001952318544458614, + "lm_loss": 0.0137939453125, + "loss": 0.0091, + "step": 2414, + "total_loss": 0.0137939453125 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019522793494689507, + "lm_loss": 0.0152587890625, + "loss": 0.0079, + "step": 2415, + "total_loss": 0.0152587890625 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001952240138770191, + "lm_loss": 0.0023193359375, + "loss": 0.0084, + "step": 2416, + "total_loss": 0.0023193359375 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019522009123629816, + "lm_loss": 0.0120849609375, + "loss": 0.01, + "step": 2417, + "total_loss": 0.0120849609375 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019521616702479697, + "lm_loss": 0.0078125, + "loss": 0.0101, + "step": 2418, + "total_loss": 0.0078125 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019521224124258027, + "lm_loss": 0.00860595703125, + "loss": 0.0072, + "step": 2419, + "total_loss": 0.00860595703125 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001952083138897128, + "lm_loss": 0.0125732421875, + "loss": 0.0074, + "step": 2420, + "total_loss": 0.0125732421875 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019520438496625935, + "lm_loss": 0.002716064453125, + "loss": 0.0084, + "step": 2421, + "total_loss": 0.002716064453125 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019520045447228476, + "lm_loss": 0.00151824951171875, + "loss": 0.0095, + "step": 2422, + "total_loss": 0.00151824951171875 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019519652240785384, + "lm_loss": 0.005462646484375, + "loss": 0.007, + "step": 2423, + "total_loss": 0.005462646484375 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019519258877303148, + "lm_loss": 0.01220703125, + "loss": 0.0095, + "step": 2424, + "total_loss": 0.01220703125 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001951886535678826, + "lm_loss": 0.0072021484375, + "loss": 0.0072, + "step": 2425, + "total_loss": 0.0072021484375 + }, + { + "epoch": 0.99, + "learning_rate": 0.000195184716792472, + "lm_loss": 0.0067138671875, + "loss": 0.0074, + "step": 2426, + "total_loss": 0.0067138671875 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001951807784468647, + "lm_loss": 0.0115966796875, + "loss": 0.008, + "step": 2427, + "total_loss": 0.0115966796875 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019517683853112572, + "lm_loss": 0.007720947265625, + "loss": 0.0085, + "step": 2428, + "total_loss": 0.007720947265625 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019517289704531996, + "lm_loss": 0.01556396484375, + "loss": 0.0067, + "step": 2429, + "total_loss": 0.01556396484375 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019516895398951248, + "lm_loss": 0.01165771484375, + "loss": 0.0065, + "step": 2430, + "total_loss": 0.01165771484375 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019516500936376833, + "lm_loss": 0.01202392578125, + "loss": 0.0102, + "step": 2431, + "total_loss": 0.01202392578125 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001951610631681526, + "lm_loss": 0.009765625, + "loss": 0.0095, + "step": 2432, + "total_loss": 0.009765625 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019515711540273032, + "lm_loss": 0.003997802734375, + "loss": 0.0087, + "step": 2433, + "total_loss": 0.003997802734375 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019515316606756667, + "lm_loss": 0.00823974609375, + "loss": 0.0095, + "step": 2434, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019514921516272679, + "lm_loss": 0.006256103515625, + "loss": 0.0073, + "step": 2435, + "total_loss": 0.006256103515625 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019514526268827584, + "lm_loss": 0.005126953125, + "loss": 0.0086, + "step": 2436, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019514130864427907, + "lm_loss": 0.00677490234375, + "loss": 0.0076, + "step": 2437, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019513735303080165, + "lm_loss": 0.01129150390625, + "loss": 0.0073, + "step": 2438, + "total_loss": 0.01129150390625 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019513339584790884, + "lm_loss": 0.00592041015625, + "loss": 0.0083, + "step": 2439, + "total_loss": 0.00592041015625 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019512943709566594, + "lm_loss": 0.009765625, + "loss": 0.0092, + "step": 2440, + "total_loss": 0.009765625 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019512547677413825, + "lm_loss": 0.0048828125, + "loss": 0.0078, + "step": 2441, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001951215148833911, + "lm_loss": 0.00799560546875, + "loss": 0.0071, + "step": 2442, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019511755142348985, + "lm_loss": 0.0048828125, + "loss": 0.0076, + "step": 2443, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019511358639449987, + "lm_loss": 0.01165771484375, + "loss": 0.0088, + "step": 2444, + "total_loss": 0.01165771484375 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019510961979648655, + "lm_loss": 0.00787353515625, + "loss": 0.0081, + "step": 2445, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019510565162951537, + "lm_loss": 0.01141357421875, + "loss": 0.0102, + "step": 2446, + "total_loss": 0.01141357421875 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019510168189365177, + "lm_loss": 0.003997802734375, + "loss": 0.009, + "step": 2447, + "total_loss": 0.003997802734375 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001950977105889612, + "lm_loss": 0.005340576171875, + "loss": 0.0076, + "step": 2448, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019509373771550928, + "lm_loss": 0.0050048828125, + "loss": 0.0077, + "step": 2449, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019508976327336144, + "lm_loss": 0.00885009765625, + "loss": 0.0081, + "step": 2450, + "total_loss": 0.00885009765625 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019508578726258326, + "lm_loss": 0.004302978515625, + "loss": 0.0082, + "step": 2451, + "total_loss": 0.004302978515625 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019508180968324037, + "lm_loss": 0.01031494140625, + "loss": 0.0079, + "step": 2452, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019507783053539837, + "lm_loss": 0.01141357421875, + "loss": 0.0079, + "step": 2453, + "total_loss": 0.01141357421875 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019507384981912288, + "lm_loss": 0.00787353515625, + "loss": 0.0094, + "step": 2454, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019506986753447958, + "lm_loss": 0.0054931640625, + "loss": 0.0075, + "step": 2455, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001950658836815342, + "lm_loss": 0.0047607421875, + "loss": 0.0077, + "step": 2456, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019506189826035234, + "lm_loss": 0.00518798828125, + "loss": 0.0067, + "step": 2457, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001950579112709999, + "lm_loss": 0.007720947265625, + "loss": 0.0079, + "step": 2458, + "total_loss": 0.007720947265625 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019505392271354258, + "lm_loss": 0.00872802734375, + "loss": 0.0081, + "step": 2459, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001950499325880461, + "lm_loss": 0.002777099609375, + "loss": 0.0059, + "step": 2460, + "total_loss": 0.002777099609375 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019504594089457644, + "lm_loss": 0.0084228515625, + "loss": 0.0074, + "step": 2461, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019504194763319933, + "lm_loss": 0.006591796875, + "loss": 0.008, + "step": 2462, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001950379528039807, + "lm_loss": 0.00982666015625, + "loss": 0.0086, + "step": 2463, + "total_loss": 0.00982666015625 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019503395640698643, + "lm_loss": 0.00445556640625, + "loss": 0.0077, + "step": 2464, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019502995844228243, + "lm_loss": 0.0078125, + "loss": 0.0069, + "step": 2465, + "total_loss": 0.0078125 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019502595890993468, + "lm_loss": 0.00689697265625, + "loss": 0.0079, + "step": 2466, + "total_loss": 0.00689697265625 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019502195781000915, + "lm_loss": 0.00811767578125, + "loss": 0.0082, + "step": 2467, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001950179551425718, + "lm_loss": 0.013916015625, + "loss": 0.0083, + "step": 2468, + "total_loss": 0.013916015625 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019501395090768876, + "lm_loss": 0.007080078125, + "loss": 0.0106, + "step": 2469, + "total_loss": 0.007080078125 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019500994510542597, + "lm_loss": 0.005523681640625, + "loss": 0.0074, + "step": 2470, + "total_loss": 0.005523681640625 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019500593773584958, + "lm_loss": 0.0031585693359375, + "loss": 0.008, + "step": 2471, + "total_loss": 0.0031585693359375 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019500192879902569, + "lm_loss": 0.007720947265625, + "loss": 0.009, + "step": 2472, + "total_loss": 0.007720947265625 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001949979182950204, + "lm_loss": 0.005859375, + "loss": 0.0075, + "step": 2473, + "total_loss": 0.005859375 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001949939062238999, + "lm_loss": 0.009033203125, + "loss": 0.0077, + "step": 2474, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019498989258573038, + "lm_loss": 0.0101318359375, + "loss": 0.0075, + "step": 2475, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019498587738057803, + "lm_loss": 0.004486083984375, + "loss": 0.0109, + "step": 2476, + "total_loss": 0.004486083984375 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019498186060850912, + "lm_loss": 0.0096435546875, + "loss": 0.0073, + "step": 2477, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019497784226958986, + "lm_loss": 0.00396728515625, + "loss": 0.0085, + "step": 2478, + "total_loss": 0.00396728515625 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019497382236388655, + "lm_loss": 0.005340576171875, + "loss": 0.0064, + "step": 2479, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001949698008914655, + "lm_loss": 0.0113525390625, + "loss": 0.0072, + "step": 2480, + "total_loss": 0.0113525390625 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001949657778523931, + "lm_loss": 0.0057373046875, + "loss": 0.0072, + "step": 2481, + "total_loss": 0.0057373046875 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001949617532467357, + "lm_loss": 0.0034027099609375, + "loss": 0.0066, + "step": 2482, + "total_loss": 0.0034027099609375 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019495772707455963, + "lm_loss": 0.01141357421875, + "loss": 0.0082, + "step": 2483, + "total_loss": 0.01141357421875 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019495369933593138, + "lm_loss": 0.0108642578125, + "loss": 0.0071, + "step": 2484, + "total_loss": 0.0108642578125 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019494967003091734, + "lm_loss": 0.00994873046875, + "loss": 0.0086, + "step": 2485, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019494563915958398, + "lm_loss": 0.01416015625, + "loss": 0.0091, + "step": 2486, + "total_loss": 0.01416015625 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019494160672199783, + "lm_loss": 0.004730224609375, + "loss": 0.0085, + "step": 2487, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019493757271822545, + "lm_loss": 0.0079345703125, + "loss": 0.0083, + "step": 2488, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019493353714833327, + "lm_loss": 0.00897216796875, + "loss": 0.006, + "step": 2489, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019492950001238796, + "lm_loss": 0.0072021484375, + "loss": 0.0076, + "step": 2490, + "total_loss": 0.0072021484375 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019492546131045604, + "lm_loss": 0.00738525390625, + "loss": 0.0076, + "step": 2491, + "total_loss": 0.00738525390625 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001949214210426042, + "lm_loss": 0.01220703125, + "loss": 0.0074, + "step": 2492, + "total_loss": 0.01220703125 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019491737920889905, + "lm_loss": 0.0142822265625, + "loss": 0.0079, + "step": 2493, + "total_loss": 0.0142822265625 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019491333580940728, + "lm_loss": 0.004425048828125, + "loss": 0.0077, + "step": 2494, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019490929084419562, + "lm_loss": 0.00860595703125, + "loss": 0.0087, + "step": 2495, + "total_loss": 0.00860595703125 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019490524431333076, + "lm_loss": 0.004638671875, + "loss": 0.0076, + "step": 2496, + "total_loss": 0.004638671875 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019490119621687942, + "lm_loss": 0.0128173828125, + "loss": 0.0071, + "step": 2497, + "total_loss": 0.0128173828125 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019489714655490846, + "lm_loss": 0.003814697265625, + "loss": 0.006, + "step": 2498, + "total_loss": 0.003814697265625 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019489309532748464, + "lm_loss": 0.01397705078125, + "loss": 0.007, + "step": 2499, + "total_loss": 0.01397705078125 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019488904253467477, + "lm_loss": 0.00799560546875, + "loss": 0.0084, + "step": 2500, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.02, + "eval_lm_loss": 0.00966216903179884, + "eval_loss": 0.01002834364771843, + "eval_runtime": 43.908, + "eval_samples_per_second": 22.775, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.00966216903179884, + "lm_loss": 0.001007080078125, + "step": 2500, + "total_loss": 0.001007080078125 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019488498817654576, + "lm_loss": 0.00872802734375, + "loss": 0.0072, + "step": 2501, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019488093225316443, + "lm_loss": 0.007415771484375, + "loss": 0.0076, + "step": 2502, + "total_loss": 0.007415771484375 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019487687476459775, + "lm_loss": 0.00537109375, + "loss": 0.0076, + "step": 2503, + "total_loss": 0.00537109375 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019487281571091262, + "lm_loss": 0.01251220703125, + "loss": 0.0078, + "step": 2504, + "total_loss": 0.01251220703125 + }, + { + "epoch": 1.02, + "learning_rate": 0.000194868755092176, + "lm_loss": 0.005767822265625, + "loss": 0.0075, + "step": 2505, + "total_loss": 0.005767822265625 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019486469290845487, + "lm_loss": 0.01080322265625, + "loss": 0.008, + "step": 2506, + "total_loss": 0.01080322265625 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019486062915981625, + "lm_loss": 0.00162506103515625, + "loss": 0.0072, + "step": 2507, + "total_loss": 0.00162506103515625 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001948565638463272, + "lm_loss": 0.008056640625, + "loss": 0.0091, + "step": 2508, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019485249696805473, + "lm_loss": 0.0079345703125, + "loss": 0.0085, + "step": 2509, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019484842852506598, + "lm_loss": 0.007537841796875, + "loss": 0.0062, + "step": 2510, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.03, + "learning_rate": 0.000194844358517428, + "lm_loss": 0.00518798828125, + "loss": 0.0069, + "step": 2511, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.03, + "learning_rate": 0.000194840286945208, + "lm_loss": 0.007537841796875, + "loss": 0.0095, + "step": 2512, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001948362138084731, + "lm_loss": 0.010009765625, + "loss": 0.0091, + "step": 2513, + "total_loss": 0.010009765625 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019483213910729058, + "lm_loss": 0.0062255859375, + "loss": 0.0072, + "step": 2514, + "total_loss": 0.0062255859375 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019482806284172753, + "lm_loss": 0.00836181640625, + "loss": 0.007, + "step": 2515, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019482398501185122, + "lm_loss": 0.00543212890625, + "loss": 0.0082, + "step": 2516, + "total_loss": 0.00543212890625 + }, + { + "epoch": 1.03, + "learning_rate": 0.000194819905617729, + "lm_loss": 0.00457763671875, + "loss": 0.008, + "step": 2517, + "total_loss": 0.00457763671875 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001948158246594281, + "lm_loss": 0.006378173828125, + "loss": 0.0068, + "step": 2518, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019481174213701585, + "lm_loss": 0.0107421875, + "loss": 0.0086, + "step": 2519, + "total_loss": 0.0107421875 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001948076580505596, + "lm_loss": 0.0128173828125, + "loss": 0.0082, + "step": 2520, + "total_loss": 0.0128173828125 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019480357240012674, + "lm_loss": 0.00518798828125, + "loss": 0.0078, + "step": 2521, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019479948518578465, + "lm_loss": 0.004364013671875, + "loss": 0.0074, + "step": 2522, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019479539640760075, + "lm_loss": 0.004638671875, + "loss": 0.0075, + "step": 2523, + "total_loss": 0.004638671875 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019479130606564246, + "lm_loss": 0.013916015625, + "loss": 0.0103, + "step": 2524, + "total_loss": 0.013916015625 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019478721415997735, + "lm_loss": 0.01104736328125, + "loss": 0.0093, + "step": 2525, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001947831206906728, + "lm_loss": 0.0079345703125, + "loss": 0.0086, + "step": 2526, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019477902565779646, + "lm_loss": 0.0038604736328125, + "loss": 0.0063, + "step": 2527, + "total_loss": 0.0038604736328125 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019477492906141581, + "lm_loss": 0.00994873046875, + "loss": 0.0085, + "step": 2528, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019477083090159845, + "lm_loss": 0.00885009765625, + "loss": 0.0091, + "step": 2529, + "total_loss": 0.00885009765625 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019476673117841195, + "lm_loss": 0.005035400390625, + "loss": 0.0103, + "step": 2530, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019476262989192398, + "lm_loss": 0.0166015625, + "loss": 0.0073, + "step": 2531, + "total_loss": 0.0166015625 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019475852704220215, + "lm_loss": 0.006591796875, + "loss": 0.0067, + "step": 2532, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019475442262931422, + "lm_loss": 0.00176239013671875, + "loss": 0.0068, + "step": 2533, + "total_loss": 0.00176239013671875 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019475031665332784, + "lm_loss": 0.00341796875, + "loss": 0.0089, + "step": 2534, + "total_loss": 0.00341796875 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019474620911431076, + "lm_loss": 0.006317138671875, + "loss": 0.0072, + "step": 2535, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019474210001233072, + "lm_loss": 0.00592041015625, + "loss": 0.0087, + "step": 2536, + "total_loss": 0.00592041015625 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001947379893474555, + "lm_loss": 0.0067138671875, + "loss": 0.0076, + "step": 2537, + "total_loss": 0.0067138671875 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019473387711975296, + "lm_loss": 0.01031494140625, + "loss": 0.0103, + "step": 2538, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019472976332929093, + "lm_loss": 0.01544189453125, + "loss": 0.0078, + "step": 2539, + "total_loss": 0.01544189453125 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019472564797613721, + "lm_loss": 0.0087890625, + "loss": 0.0064, + "step": 2540, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019472153106035974, + "lm_loss": 0.01031494140625, + "loss": 0.0074, + "step": 2541, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019471741258202642, + "lm_loss": 0.006103515625, + "loss": 0.0071, + "step": 2542, + "total_loss": 0.006103515625 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001947132925412052, + "lm_loss": 0.01025390625, + "loss": 0.0079, + "step": 2543, + "total_loss": 0.01025390625 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019470917093796403, + "lm_loss": 0.0052490234375, + "loss": 0.0068, + "step": 2544, + "total_loss": 0.0052490234375 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001947050477723709, + "lm_loss": 0.0042724609375, + "loss": 0.0081, + "step": 2545, + "total_loss": 0.0042724609375 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019470092304449386, + "lm_loss": 0.00592041015625, + "loss": 0.0077, + "step": 2546, + "total_loss": 0.00592041015625 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001946967967544009, + "lm_loss": 0.0084228515625, + "loss": 0.0096, + "step": 2547, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019469266890216014, + "lm_loss": 0.00848388671875, + "loss": 0.0064, + "step": 2548, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019468853948783962, + "lm_loss": 0.006103515625, + "loss": 0.0094, + "step": 2549, + "total_loss": 0.006103515625 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019468440851150753, + "lm_loss": 0.0101318359375, + "loss": 0.0067, + "step": 2550, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019468027597323198, + "lm_loss": 0.0068359375, + "loss": 0.0081, + "step": 2551, + "total_loss": 0.0068359375 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001946761418730811, + "lm_loss": 0.0015869140625, + "loss": 0.0094, + "step": 2552, + "total_loss": 0.0015869140625 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019467200621112312, + "lm_loss": 0.008544921875, + "loss": 0.0072, + "step": 2553, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001946678689874263, + "lm_loss": 0.00677490234375, + "loss": 0.0066, + "step": 2554, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019466373020205887, + "lm_loss": 0.00860595703125, + "loss": 0.0088, + "step": 2555, + "total_loss": 0.00860595703125 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019465958985508906, + "lm_loss": 0.00653076171875, + "loss": 0.0089, + "step": 2556, + "total_loss": 0.00653076171875 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019465544794658522, + "lm_loss": 0.00787353515625, + "loss": 0.0091, + "step": 2557, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019465130447661566, + "lm_loss": 0.0101318359375, + "loss": 0.0095, + "step": 2558, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019464715944524872, + "lm_loss": 0.00555419921875, + "loss": 0.0085, + "step": 2559, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001946430128525528, + "lm_loss": 0.0029754638671875, + "loss": 0.0079, + "step": 2560, + "total_loss": 0.0029754638671875 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001946388646985963, + "lm_loss": 0.005157470703125, + "loss": 0.0094, + "step": 2561, + "total_loss": 0.005157470703125 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019463471498344762, + "lm_loss": 0.006683349609375, + "loss": 0.0071, + "step": 2562, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019463056370717523, + "lm_loss": 0.005828857421875, + "loss": 0.0069, + "step": 2563, + "total_loss": 0.005828857421875 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019462641086984763, + "lm_loss": 0.0108642578125, + "loss": 0.0074, + "step": 2564, + "total_loss": 0.0108642578125 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001946222564715333, + "lm_loss": 0.00482177734375, + "loss": 0.008, + "step": 2565, + "total_loss": 0.00482177734375 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019461810051230082, + "lm_loss": 0.01068115234375, + "loss": 0.0075, + "step": 2566, + "total_loss": 0.01068115234375 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019461394299221868, + "lm_loss": 0.005462646484375, + "loss": 0.0078, + "step": 2567, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019460978391135553, + "lm_loss": 0.00628662109375, + "loss": 0.0087, + "step": 2568, + "total_loss": 0.00628662109375 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019460562326977993, + "lm_loss": 0.0037994384765625, + "loss": 0.0077, + "step": 2569, + "total_loss": 0.0037994384765625 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019460146106756054, + "lm_loss": 0.0130615234375, + "loss": 0.0059, + "step": 2570, + "total_loss": 0.0130615234375 + }, + { + "epoch": 1.05, + "learning_rate": 0.000194597297304766, + "lm_loss": 0.0115966796875, + "loss": 0.0079, + "step": 2571, + "total_loss": 0.0115966796875 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019459313198146502, + "lm_loss": 0.00836181640625, + "loss": 0.0093, + "step": 2572, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001945889650977263, + "lm_loss": 0.010986328125, + "loss": 0.0077, + "step": 2573, + "total_loss": 0.010986328125 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001945847966536186, + "lm_loss": 0.007781982421875, + "loss": 0.0097, + "step": 2574, + "total_loss": 0.007781982421875 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019458062664921063, + "lm_loss": 0.00665283203125, + "loss": 0.007, + "step": 2575, + "total_loss": 0.00665283203125 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019457645508457125, + "lm_loss": 0.006805419921875, + "loss": 0.0069, + "step": 2576, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019457228195976923, + "lm_loss": 0.0220947265625, + "loss": 0.0093, + "step": 2577, + "total_loss": 0.0220947265625 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019456810727487342, + "lm_loss": 0.01190185546875, + "loss": 0.0082, + "step": 2578, + "total_loss": 0.01190185546875 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001945639310299527, + "lm_loss": 0.006378173828125, + "loss": 0.0073, + "step": 2579, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019455975322507592, + "lm_loss": 0.004241943359375, + "loss": 0.0076, + "step": 2580, + "total_loss": 0.004241943359375 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019455557386031204, + "lm_loss": 0.005035400390625, + "loss": 0.0097, + "step": 2581, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019455139293573, + "lm_loss": 0.00396728515625, + "loss": 0.0063, + "step": 2582, + "total_loss": 0.00396728515625 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019454721045139873, + "lm_loss": 0.00958251953125, + "loss": 0.0099, + "step": 2583, + "total_loss": 0.00958251953125 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001945430264073873, + "lm_loss": 0.012939453125, + "loss": 0.0077, + "step": 2584, + "total_loss": 0.012939453125 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019453884080376468, + "lm_loss": 0.004608154296875, + "loss": 0.0081, + "step": 2585, + "total_loss": 0.004608154296875 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019453465364059996, + "lm_loss": 0.0091552734375, + "loss": 0.0096, + "step": 2586, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019453046491796214, + "lm_loss": 0.00726318359375, + "loss": 0.0074, + "step": 2587, + "total_loss": 0.00726318359375 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019452627463592036, + "lm_loss": 0.007293701171875, + "loss": 0.0077, + "step": 2588, + "total_loss": 0.007293701171875 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019452208279454374, + "lm_loss": 0.004791259765625, + "loss": 0.0077, + "step": 2589, + "total_loss": 0.004791259765625 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019451788939390144, + "lm_loss": 0.005950927734375, + "loss": 0.0083, + "step": 2590, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019451369443406263, + "lm_loss": 0.01025390625, + "loss": 0.0075, + "step": 2591, + "total_loss": 0.01025390625 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001945094979150965, + "lm_loss": 0.01007080078125, + "loss": 0.0087, + "step": 2592, + "total_loss": 0.01007080078125 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019450529983707228, + "lm_loss": 0.01031494140625, + "loss": 0.0072, + "step": 2593, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019450110020005924, + "lm_loss": 0.004852294921875, + "loss": 0.0116, + "step": 2594, + "total_loss": 0.004852294921875 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019449689900412665, + "lm_loss": 0.0030670166015625, + "loss": 0.0073, + "step": 2595, + "total_loss": 0.0030670166015625 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019449269624934382, + "lm_loss": 0.005279541015625, + "loss": 0.0066, + "step": 2596, + "total_loss": 0.005279541015625 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019448849193578007, + "lm_loss": 0.005584716796875, + "loss": 0.0073, + "step": 2597, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019448428606350476, + "lm_loss": 0.00579833984375, + "loss": 0.0086, + "step": 2598, + "total_loss": 0.00579833984375 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019448007863258723, + "lm_loss": 0.00634765625, + "loss": 0.0066, + "step": 2599, + "total_loss": 0.00634765625 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019447586964309695, + "lm_loss": 0.00933837890625, + "loss": 0.0093, + "step": 2600, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.06, + "eval_lm_loss": 0.009916252456605434, + "eval_loss": 0.010305652394890785, + "eval_runtime": 43.8981, + "eval_samples_per_second": 22.78, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009916252456605434, + "lm_loss": 0.0018463134765625, + "step": 2600, + "total_loss": 0.0018463134765625 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019447165909510335, + "lm_loss": 0.0120849609375, + "loss": 0.0085, + "step": 2601, + "total_loss": 0.0120849609375 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019446744698867587, + "lm_loss": 0.0057373046875, + "loss": 0.0065, + "step": 2602, + "total_loss": 0.0057373046875 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019446323332388397, + "lm_loss": 0.004913330078125, + "loss": 0.0087, + "step": 2603, + "total_loss": 0.004913330078125 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001944590181007972, + "lm_loss": 0.00628662109375, + "loss": 0.009, + "step": 2604, + "total_loss": 0.00628662109375 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019445480131948504, + "lm_loss": 0.01123046875, + "loss": 0.0071, + "step": 2605, + "total_loss": 0.01123046875 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019445058298001714, + "lm_loss": 0.0089111328125, + "loss": 0.008, + "step": 2606, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019444636308246297, + "lm_loss": 0.006744384765625, + "loss": 0.0115, + "step": 2607, + "total_loss": 0.006744384765625 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019444214162689228, + "lm_loss": 0.006195068359375, + "loss": 0.0087, + "step": 2608, + "total_loss": 0.006195068359375 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001944379186133746, + "lm_loss": 0.005828857421875, + "loss": 0.0075, + "step": 2609, + "total_loss": 0.005828857421875 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001944336940419796, + "lm_loss": 0.0115966796875, + "loss": 0.0101, + "step": 2610, + "total_loss": 0.0115966796875 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019442946791277708, + "lm_loss": 0.008056640625, + "loss": 0.0071, + "step": 2611, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019442524022583661, + "lm_loss": 0.00885009765625, + "loss": 0.009, + "step": 2612, + "total_loss": 0.00885009765625 + }, + { + "epoch": 1.07, + "learning_rate": 0.000194421010981228, + "lm_loss": 0.00811767578125, + "loss": 0.007, + "step": 2613, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019441678017902104, + "lm_loss": 0.012939453125, + "loss": 0.0104, + "step": 2614, + "total_loss": 0.012939453125 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001944125478192855, + "lm_loss": 0.00640869140625, + "loss": 0.008, + "step": 2615, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019440831390209117, + "lm_loss": 0.0093994140625, + "loss": 0.0107, + "step": 2616, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019440407842750794, + "lm_loss": 0.005706787109375, + "loss": 0.0073, + "step": 2617, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001943998413956057, + "lm_loss": 0.0093994140625, + "loss": 0.0087, + "step": 2618, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019439560280645423, + "lm_loss": 0.0076904296875, + "loss": 0.0083, + "step": 2619, + "total_loss": 0.0076904296875 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019439136266012353, + "lm_loss": 0.0050048828125, + "loss": 0.0074, + "step": 2620, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019438712095668357, + "lm_loss": 0.005096435546875, + "loss": 0.0081, + "step": 2621, + "total_loss": 0.005096435546875 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001943828776962043, + "lm_loss": 0.0098876953125, + "loss": 0.0085, + "step": 2622, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001943786328787557, + "lm_loss": 0.00640869140625, + "loss": 0.0093, + "step": 2623, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019437438650440783, + "lm_loss": 0.01141357421875, + "loss": 0.0086, + "step": 2624, + "total_loss": 0.01141357421875 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001943701385732307, + "lm_loss": 0.0159912109375, + "loss": 0.0083, + "step": 2625, + "total_loss": 0.0159912109375 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019436588908529437, + "lm_loss": 0.01336669921875, + "loss": 0.0065, + "step": 2626, + "total_loss": 0.01336669921875 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019436163804066904, + "lm_loss": 0.005035400390625, + "loss": 0.0077, + "step": 2627, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019435738543942472, + "lm_loss": 0.0103759765625, + "loss": 0.0078, + "step": 2628, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019435313128163162, + "lm_loss": 0.013671875, + "loss": 0.0091, + "step": 2629, + "total_loss": 0.013671875 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001943488755673599, + "lm_loss": 0.006622314453125, + "loss": 0.0072, + "step": 2630, + "total_loss": 0.006622314453125 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019434461829667977, + "lm_loss": 0.0048828125, + "loss": 0.0067, + "step": 2631, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019434035946966146, + "lm_loss": 0.0145263671875, + "loss": 0.008, + "step": 2632, + "total_loss": 0.0145263671875 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019433609908637526, + "lm_loss": 0.0159912109375, + "loss": 0.0087, + "step": 2633, + "total_loss": 0.0159912109375 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001943318371468914, + "lm_loss": 0.01068115234375, + "loss": 0.0084, + "step": 2634, + "total_loss": 0.01068115234375 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019432757365128023, + "lm_loss": 0.003936767578125, + "loss": 0.0065, + "step": 2635, + "total_loss": 0.003936767578125 + }, + { + "epoch": 1.08, + "learning_rate": 0.000194323308599612, + "lm_loss": 0.0135498046875, + "loss": 0.0066, + "step": 2636, + "total_loss": 0.0135498046875 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019431904199195717, + "lm_loss": 0.0040283203125, + "loss": 0.0082, + "step": 2637, + "total_loss": 0.0040283203125 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019431477382838606, + "lm_loss": 0.010986328125, + "loss": 0.0078, + "step": 2638, + "total_loss": 0.010986328125 + }, + { + "epoch": 1.08, + "learning_rate": 0.0001943105041089691, + "lm_loss": 0.004302978515625, + "loss": 0.0077, + "step": 2639, + "total_loss": 0.004302978515625 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019430623283377672, + "lm_loss": 0.0086669921875, + "loss": 0.0062, + "step": 2640, + "total_loss": 0.0086669921875 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019430196000287935, + "lm_loss": 0.00555419921875, + "loss": 0.0084, + "step": 2641, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019429768561634754, + "lm_loss": 0.005706787109375, + "loss": 0.0079, + "step": 2642, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019429340967425175, + "lm_loss": 0.00714111328125, + "loss": 0.008, + "step": 2643, + "total_loss": 0.00714111328125 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019428913217666257, + "lm_loss": 0.01171875, + "loss": 0.0097, + "step": 2644, + "total_loss": 0.01171875 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019428485312365047, + "lm_loss": 0.0223388671875, + "loss": 0.0091, + "step": 2645, + "total_loss": 0.0223388671875 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019428057251528615, + "lm_loss": 0.01409912109375, + "loss": 0.0082, + "step": 2646, + "total_loss": 0.01409912109375 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019427629035164015, + "lm_loss": 0.01263427734375, + "loss": 0.0092, + "step": 2647, + "total_loss": 0.01263427734375 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019427200663278314, + "lm_loss": 0.01495361328125, + "loss": 0.0078, + "step": 2648, + "total_loss": 0.01495361328125 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019426772135878575, + "lm_loss": 0.00872802734375, + "loss": 0.0072, + "step": 2649, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019426343452971873, + "lm_loss": 0.01031494140625, + "loss": 0.0076, + "step": 2650, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019425914614565275, + "lm_loss": 0.0033721923828125, + "loss": 0.0062, + "step": 2651, + "total_loss": 0.0033721923828125 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019425485620665857, + "lm_loss": 0.006378173828125, + "loss": 0.0089, + "step": 2652, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019425056471280696, + "lm_loss": 0.0101318359375, + "loss": 0.0068, + "step": 2653, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001942462716641687, + "lm_loss": 0.01226806640625, + "loss": 0.0095, + "step": 2654, + "total_loss": 0.01226806640625 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001942419770608146, + "lm_loss": 0.005767822265625, + "loss": 0.0062, + "step": 2655, + "total_loss": 0.005767822265625 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019423768090281557, + "lm_loss": 0.00860595703125, + "loss": 0.0077, + "step": 2656, + "total_loss": 0.00860595703125 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001942333831902424, + "lm_loss": 0.0027923583984375, + "loss": 0.0076, + "step": 2657, + "total_loss": 0.0027923583984375 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019422908392316602, + "lm_loss": 0.00555419921875, + "loss": 0.0078, + "step": 2658, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001942247831016574, + "lm_loss": 0.0167236328125, + "loss": 0.0091, + "step": 2659, + "total_loss": 0.0167236328125 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019422048072578738, + "lm_loss": 0.01123046875, + "loss": 0.0086, + "step": 2660, + "total_loss": 0.01123046875 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019421617679562703, + "lm_loss": 0.005096435546875, + "loss": 0.008, + "step": 2661, + "total_loss": 0.005096435546875 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001942118713112473, + "lm_loss": 0.004608154296875, + "loss": 0.0093, + "step": 2662, + "total_loss": 0.004608154296875 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019420756427271921, + "lm_loss": 0.0054931640625, + "loss": 0.0062, + "step": 2663, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019420325568011385, + "lm_loss": 0.004974365234375, + "loss": 0.0059, + "step": 2664, + "total_loss": 0.004974365234375 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019419894553350227, + "lm_loss": 0.005706787109375, + "loss": 0.0083, + "step": 2665, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019419463383295557, + "lm_loss": 0.00946044921875, + "loss": 0.0083, + "step": 2666, + "total_loss": 0.00946044921875 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019419032057854492, + "lm_loss": 0.007293701171875, + "loss": 0.0072, + "step": 2667, + "total_loss": 0.007293701171875 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019418600577034137, + "lm_loss": 0.003814697265625, + "loss": 0.0082, + "step": 2668, + "total_loss": 0.003814697265625 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001941816894084162, + "lm_loss": 0.006988525390625, + "loss": 0.0076, + "step": 2669, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019417737149284058, + "lm_loss": 0.0084228515625, + "loss": 0.008, + "step": 2670, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019417305202368575, + "lm_loss": 0.00885009765625, + "loss": 0.0075, + "step": 2671, + "total_loss": 0.00885009765625 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001941687310010229, + "lm_loss": 0.01361083984375, + "loss": 0.0084, + "step": 2672, + "total_loss": 0.01361083984375 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019416440842492342, + "lm_loss": 0.00872802734375, + "loss": 0.0098, + "step": 2673, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019416008429545852, + "lm_loss": 0.00970458984375, + "loss": 0.0096, + "step": 2674, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001941557586126996, + "lm_loss": 0.00286865234375, + "loss": 0.008, + "step": 2675, + "total_loss": 0.00286865234375 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019415143137671798, + "lm_loss": 0.00555419921875, + "loss": 0.0077, + "step": 2676, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019414710258758506, + "lm_loss": 0.010986328125, + "loss": 0.0093, + "step": 2677, + "total_loss": 0.010986328125 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019414277224537224, + "lm_loss": 0.004058837890625, + "loss": 0.0088, + "step": 2678, + "total_loss": 0.004058837890625 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019413844035015093, + "lm_loss": 0.0096435546875, + "loss": 0.0087, + "step": 2679, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019413410690199265, + "lm_loss": 0.0023956298828125, + "loss": 0.0076, + "step": 2680, + "total_loss": 0.0023956298828125 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019412977190096884, + "lm_loss": 0.01226806640625, + "loss": 0.0083, + "step": 2681, + "total_loss": 0.01226806640625 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019412543534715103, + "lm_loss": 0.01129150390625, + "loss": 0.0101, + "step": 2682, + "total_loss": 0.01129150390625 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019412109724061075, + "lm_loss": 0.00921630859375, + "loss": 0.0075, + "step": 2683, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019411675758141956, + "lm_loss": 0.005126953125, + "loss": 0.0066, + "step": 2684, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019411241636964907, + "lm_loss": 0.0079345703125, + "loss": 0.0076, + "step": 2685, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019410807360537087, + "lm_loss": 0.004608154296875, + "loss": 0.0059, + "step": 2686, + "total_loss": 0.004608154296875 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001941037292886566, + "lm_loss": 0.0157470703125, + "loss": 0.0073, + "step": 2687, + "total_loss": 0.0157470703125 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019409938341957795, + "lm_loss": 0.0079345703125, + "loss": 0.0088, + "step": 2688, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019409503599820656, + "lm_loss": 0.0089111328125, + "loss": 0.0073, + "step": 2689, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019409068702461418, + "lm_loss": 0.00262451171875, + "loss": 0.0065, + "step": 2690, + "total_loss": 0.00262451171875 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001940863364988726, + "lm_loss": 0.0035400390625, + "loss": 0.0065, + "step": 2691, + "total_loss": 0.0035400390625 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001940819844210535, + "lm_loss": 0.00836181640625, + "loss": 0.0079, + "step": 2692, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001940776307912287, + "lm_loss": 0.003997802734375, + "loss": 0.0085, + "step": 2693, + "total_loss": 0.003997802734375 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019407327560947007, + "lm_loss": 0.011474609375, + "loss": 0.0088, + "step": 2694, + "total_loss": 0.011474609375 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001940689188758494, + "lm_loss": 0.0037384033203125, + "loss": 0.0071, + "step": 2695, + "total_loss": 0.0037384033203125 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019406456059043858, + "lm_loss": 0.006378173828125, + "loss": 0.0059, + "step": 2696, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019406020075330948, + "lm_loss": 0.00689697265625, + "loss": 0.0061, + "step": 2697, + "total_loss": 0.00689697265625 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019405583936453407, + "lm_loss": 0.006805419921875, + "loss": 0.0067, + "step": 2698, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019405147642418425, + "lm_loss": 0.0107421875, + "loss": 0.0074, + "step": 2699, + "total_loss": 0.0107421875 + }, + { + "epoch": 1.1, + "learning_rate": 0.000194047111932332, + "lm_loss": 0.0123291015625, + "loss": 0.0082, + "step": 2700, + "total_loss": 0.0123291015625 + }, + { + "epoch": 1.1, + "eval_lm_loss": 0.00977067556232214, + "eval_loss": 0.010191231034696102, + "eval_runtime": 44.263, + "eval_samples_per_second": 22.592, + "eval_steps_per_second": 0.203, + "eval_total_loss": 0.00977067556232214, + "lm_loss": 0.000732421875, + "step": 2700, + "total_loss": 0.000732421875 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019404274588904935, + "lm_loss": 0.010498046875, + "loss": 0.0095, + "step": 2701, + "total_loss": 0.010498046875 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001940383782944083, + "lm_loss": 0.01141357421875, + "loss": 0.0092, + "step": 2702, + "total_loss": 0.01141357421875 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001940340091484809, + "lm_loss": 0.01141357421875, + "loss": 0.0077, + "step": 2703, + "total_loss": 0.01141357421875 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019402963845133924, + "lm_loss": 0.00762939453125, + "loss": 0.0073, + "step": 2704, + "total_loss": 0.00762939453125 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019402526620305535, + "lm_loss": 0.00909423828125, + "loss": 0.0098, + "step": 2705, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019402089240370146, + "lm_loss": 0.005706787109375, + "loss": 0.0084, + "step": 2706, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019401651705334971, + "lm_loss": 0.00921630859375, + "loss": 0.0085, + "step": 2707, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019401214015207218, + "lm_loss": 0.007598876953125, + "loss": 0.0068, + "step": 2708, + "total_loss": 0.007598876953125 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019400776169994116, + "lm_loss": 0.004791259765625, + "loss": 0.0086, + "step": 2709, + "total_loss": 0.004791259765625 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019400338169702885, + "lm_loss": 0.0057373046875, + "loss": 0.0072, + "step": 2710, + "total_loss": 0.0057373046875 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019399900014340754, + "lm_loss": 0.00909423828125, + "loss": 0.0088, + "step": 2711, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019399461703914942, + "lm_loss": 0.00872802734375, + "loss": 0.0061, + "step": 2712, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019399023238432687, + "lm_loss": 0.00787353515625, + "loss": 0.0087, + "step": 2713, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019398584617901223, + "lm_loss": 0.004974365234375, + "loss": 0.0069, + "step": 2714, + "total_loss": 0.004974365234375 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001939814584232778, + "lm_loss": 0.01171875, + "loss": 0.0078, + "step": 2715, + "total_loss": 0.01171875 + }, + { + "epoch": 1.11, + "learning_rate": 0.000193977069117196, + "lm_loss": 0.01092529296875, + "loss": 0.0084, + "step": 2716, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019397267826083922, + "lm_loss": 0.0069580078125, + "loss": 0.0089, + "step": 2717, + "total_loss": 0.0069580078125 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019396828585427993, + "lm_loss": 0.00543212890625, + "loss": 0.0071, + "step": 2718, + "total_loss": 0.00543212890625 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019396389189759052, + "lm_loss": 0.00958251953125, + "loss": 0.0081, + "step": 2719, + "total_loss": 0.00958251953125 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019395949639084355, + "lm_loss": 0.0111083984375, + "loss": 0.0076, + "step": 2720, + "total_loss": 0.0111083984375 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019395509933411146, + "lm_loss": 0.0157470703125, + "loss": 0.0072, + "step": 2721, + "total_loss": 0.0157470703125 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019395070072746683, + "lm_loss": 0.004302978515625, + "loss": 0.008, + "step": 2722, + "total_loss": 0.004302978515625 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001939463005709822, + "lm_loss": 0.00677490234375, + "loss": 0.0082, + "step": 2723, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019394189886473017, + "lm_loss": 0.00836181640625, + "loss": 0.0082, + "step": 2724, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019393749560878338, + "lm_loss": 0.0047607421875, + "loss": 0.0072, + "step": 2725, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019393309080321437, + "lm_loss": 0.00567626953125, + "loss": 0.0086, + "step": 2726, + "total_loss": 0.00567626953125 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019392868444809591, + "lm_loss": 0.0029296875, + "loss": 0.0068, + "step": 2727, + "total_loss": 0.0029296875 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019392427654350066, + "lm_loss": 0.005126953125, + "loss": 0.0072, + "step": 2728, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001939198670895013, + "lm_loss": 0.003265380859375, + "loss": 0.0071, + "step": 2729, + "total_loss": 0.003265380859375 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019391545608617058, + "lm_loss": 0.00506591796875, + "loss": 0.0076, + "step": 2730, + "total_loss": 0.00506591796875 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019391104353358126, + "lm_loss": 0.00872802734375, + "loss": 0.0089, + "step": 2731, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001939066294318062, + "lm_loss": 0.015380859375, + "loss": 0.0095, + "step": 2732, + "total_loss": 0.015380859375 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019390221378091813, + "lm_loss": 0.0033111572265625, + "loss": 0.007, + "step": 2733, + "total_loss": 0.0033111572265625 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019389779658098992, + "lm_loss": 0.003814697265625, + "loss": 0.008, + "step": 2734, + "total_loss": 0.003814697265625 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019389337783209446, + "lm_loss": 0.0107421875, + "loss": 0.0074, + "step": 2735, + "total_loss": 0.0107421875 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019388895753430464, + "lm_loss": 0.0087890625, + "loss": 0.0081, + "step": 2736, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001938845356876933, + "lm_loss": 0.006805419921875, + "loss": 0.0088, + "step": 2737, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001938801122923335, + "lm_loss": 0.0072021484375, + "loss": 0.0076, + "step": 2738, + "total_loss": 0.0072021484375 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019387568734829813, + "lm_loss": 0.0091552734375, + "loss": 0.0067, + "step": 2739, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019387126085566023, + "lm_loss": 0.005340576171875, + "loss": 0.0078, + "step": 2740, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001938668328144928, + "lm_loss": 0.01025390625, + "loss": 0.0101, + "step": 2741, + "total_loss": 0.01025390625 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001938624032248689, + "lm_loss": 0.01287841796875, + "loss": 0.0085, + "step": 2742, + "total_loss": 0.01287841796875 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019385797208686158, + "lm_loss": 0.00653076171875, + "loss": 0.0084, + "step": 2743, + "total_loss": 0.00653076171875 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019385353940054397, + "lm_loss": 0.0093994140625, + "loss": 0.0058, + "step": 2744, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019384910516598915, + "lm_loss": 0.0096435546875, + "loss": 0.0076, + "step": 2745, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019384466938327029, + "lm_loss": 0.00982666015625, + "loss": 0.0112, + "step": 2746, + "total_loss": 0.00982666015625 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019384023205246055, + "lm_loss": 0.0113525390625, + "loss": 0.0073, + "step": 2747, + "total_loss": 0.0113525390625 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001938357931736332, + "lm_loss": 0.0050048828125, + "loss": 0.007, + "step": 2748, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019383135274686134, + "lm_loss": 0.01324462890625, + "loss": 0.0081, + "step": 2749, + "total_loss": 0.01324462890625 + }, + { + "epoch": 1.12, + "learning_rate": 0.00019382691077221833, + "lm_loss": 0.00836181640625, + "loss": 0.0088, + "step": 2750, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001938224672497774, + "lm_loss": 0.00848388671875, + "loss": 0.0071, + "step": 2751, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019381802217961186, + "lm_loss": 0.01080322265625, + "loss": 0.0087, + "step": 2752, + "total_loss": 0.01080322265625 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019381357556179504, + "lm_loss": 0.00433349609375, + "loss": 0.0076, + "step": 2753, + "total_loss": 0.00433349609375 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019380912739640026, + "lm_loss": 0.005126953125, + "loss": 0.0065, + "step": 2754, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019380467768350098, + "lm_loss": 0.00982666015625, + "loss": 0.0059, + "step": 2755, + "total_loss": 0.00982666015625 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019380022642317052, + "lm_loss": 0.01611328125, + "loss": 0.0093, + "step": 2756, + "total_loss": 0.01611328125 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019379577361548233, + "lm_loss": 0.0164794921875, + "loss": 0.0088, + "step": 2757, + "total_loss": 0.0164794921875 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019379131926050986, + "lm_loss": 0.01239013671875, + "loss": 0.0074, + "step": 2758, + "total_loss": 0.01239013671875 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019378686335832663, + "lm_loss": 0.004180908203125, + "loss": 0.006, + "step": 2759, + "total_loss": 0.004180908203125 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019378240590900615, + "lm_loss": 0.006072998046875, + "loss": 0.0077, + "step": 2760, + "total_loss": 0.006072998046875 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019377794691262187, + "lm_loss": 0.00897216796875, + "loss": 0.0074, + "step": 2761, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019377348636924742, + "lm_loss": 0.011474609375, + "loss": 0.0073, + "step": 2762, + "total_loss": 0.011474609375 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019376902427895636, + "lm_loss": 0.00408935546875, + "loss": 0.0073, + "step": 2763, + "total_loss": 0.00408935546875 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019376456064182232, + "lm_loss": 0.0062255859375, + "loss": 0.008, + "step": 2764, + "total_loss": 0.0062255859375 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001937600954579189, + "lm_loss": 0.01348876953125, + "loss": 0.0084, + "step": 2765, + "total_loss": 0.01348876953125 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019375562872731976, + "lm_loss": 0.0076904296875, + "loss": 0.0073, + "step": 2766, + "total_loss": 0.0076904296875 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001937511604500986, + "lm_loss": 0.0147705078125, + "loss": 0.0107, + "step": 2767, + "total_loss": 0.0147705078125 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019374669062632914, + "lm_loss": 0.01104736328125, + "loss": 0.0088, + "step": 2768, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019374221925608508, + "lm_loss": 0.0038299560546875, + "loss": 0.0082, + "step": 2769, + "total_loss": 0.0038299560546875 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001937377463394402, + "lm_loss": 0.003936767578125, + "loss": 0.0061, + "step": 2770, + "total_loss": 0.003936767578125 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019373327187646836, + "lm_loss": 0.006683349609375, + "loss": 0.0065, + "step": 2771, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019372879586724323, + "lm_loss": 0.00994873046875, + "loss": 0.008, + "step": 2772, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019372431831183874, + "lm_loss": 0.0054931640625, + "loss": 0.0067, + "step": 2773, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019371983921032872, + "lm_loss": 0.0054931640625, + "loss": 0.0085, + "step": 2774, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001937153585627871, + "lm_loss": 0.009033203125, + "loss": 0.0088, + "step": 2775, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.13, + "learning_rate": 0.00019371087636928778, + "lm_loss": 0.0069580078125, + "loss": 0.0086, + "step": 2776, + "total_loss": 0.0069580078125 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019370639262990464, + "lm_loss": 0.0224609375, + "loss": 0.0084, + "step": 2777, + "total_loss": 0.0224609375 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019370190734471172, + "lm_loss": 0.005950927734375, + "loss": 0.0082, + "step": 2778, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019369742051378297, + "lm_loss": 0.009521484375, + "loss": 0.0065, + "step": 2779, + "total_loss": 0.009521484375 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001936929321371924, + "lm_loss": 0.00640869140625, + "loss": 0.0076, + "step": 2780, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001936884422150141, + "lm_loss": 0.0035247802734375, + "loss": 0.0086, + "step": 2781, + "total_loss": 0.0035247802734375 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019368395074732212, + "lm_loss": 0.006134033203125, + "loss": 0.0066, + "step": 2782, + "total_loss": 0.006134033203125 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001936794577341905, + "lm_loss": 0.006378173828125, + "loss": 0.0123, + "step": 2783, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019367496317569343, + "lm_loss": 0.006134033203125, + "loss": 0.0075, + "step": 2784, + "total_loss": 0.006134033203125 + }, + { + "epoch": 1.14, + "learning_rate": 0.000193670467071905, + "lm_loss": 0.007293701171875, + "loss": 0.0072, + "step": 2785, + "total_loss": 0.007293701171875 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019366596942289941, + "lm_loss": 0.006378173828125, + "loss": 0.0088, + "step": 2786, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019366147022875082, + "lm_loss": 0.00592041015625, + "loss": 0.0073, + "step": 2787, + "total_loss": 0.00592041015625 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019365696948953352, + "lm_loss": 0.01385498046875, + "loss": 0.0082, + "step": 2788, + "total_loss": 0.01385498046875 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019365246720532168, + "lm_loss": 0.013671875, + "loss": 0.0083, + "step": 2789, + "total_loss": 0.013671875 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001936479633761896, + "lm_loss": 0.00830078125, + "loss": 0.008, + "step": 2790, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019364345800221158, + "lm_loss": 0.002288818359375, + "loss": 0.0089, + "step": 2791, + "total_loss": 0.002288818359375 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019363895108346198, + "lm_loss": 0.00909423828125, + "loss": 0.0079, + "step": 2792, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019363444262001506, + "lm_loss": 0.007171630859375, + "loss": 0.0069, + "step": 2793, + "total_loss": 0.007171630859375 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019362993261194526, + "lm_loss": 0.004364013671875, + "loss": 0.0073, + "step": 2794, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019362542105932695, + "lm_loss": 0.006805419921875, + "loss": 0.0075, + "step": 2795, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019362090796223457, + "lm_loss": 0.00933837890625, + "loss": 0.0098, + "step": 2796, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019361639332074256, + "lm_loss": 0.00799560546875, + "loss": 0.0081, + "step": 2797, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001936118771349254, + "lm_loss": 0.0146484375, + "loss": 0.009, + "step": 2798, + "total_loss": 0.0146484375 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019360735940485762, + "lm_loss": 0.01165771484375, + "loss": 0.0074, + "step": 2799, + "total_loss": 0.01165771484375 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019360284013061367, + "lm_loss": 0.006866455078125, + "loss": 0.0075, + "step": 2800, + "total_loss": 0.006866455078125 + }, + { + "epoch": 1.14, + "eval_lm_loss": 0.009459610097110271, + "eval_loss": 0.009879998862743378, + "eval_runtime": 44.0219, + "eval_samples_per_second": 22.716, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.009459610097110271, + "lm_loss": 0.00095367431640625, + "step": 2800, + "total_loss": 0.00095367431640625 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019359831931226815, + "lm_loss": 0.00787353515625, + "loss": 0.0086, + "step": 2801, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019359379694989563, + "lm_loss": 0.007232666015625, + "loss": 0.0095, + "step": 2802, + "total_loss": 0.007232666015625 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019358927304357072, + "lm_loss": 0.01177978515625, + "loss": 0.01, + "step": 2803, + "total_loss": 0.01177978515625 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019358474759336805, + "lm_loss": 0.0062255859375, + "loss": 0.0073, + "step": 2804, + "total_loss": 0.0062255859375 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019358022059936226, + "lm_loss": 0.00946044921875, + "loss": 0.0072, + "step": 2805, + "total_loss": 0.00946044921875 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019357569206162803, + "lm_loss": 0.006622314453125, + "loss": 0.0079, + "step": 2806, + "total_loss": 0.006622314453125 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019357116198024011, + "lm_loss": 0.0033111572265625, + "loss": 0.0074, + "step": 2807, + "total_loss": 0.0033111572265625 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019356663035527313, + "lm_loss": 0.0133056640625, + "loss": 0.0081, + "step": 2808, + "total_loss": 0.0133056640625 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019356209718680192, + "lm_loss": 0.006683349609375, + "loss": 0.0068, + "step": 2809, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019355756247490125, + "lm_loss": 0.00958251953125, + "loss": 0.0072, + "step": 2810, + "total_loss": 0.00958251953125 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019355302621964593, + "lm_loss": 0.005340576171875, + "loss": 0.0066, + "step": 2811, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019354848842111078, + "lm_loss": 0.010986328125, + "loss": 0.0088, + "step": 2812, + "total_loss": 0.010986328125 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019354394907937064, + "lm_loss": 0.00714111328125, + "loss": 0.0087, + "step": 2813, + "total_loss": 0.00714111328125 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019353940819450042, + "lm_loss": 0.0034942626953125, + "loss": 0.0081, + "step": 2814, + "total_loss": 0.0034942626953125 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019353486576657503, + "lm_loss": 0.005706787109375, + "loss": 0.0077, + "step": 2815, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019353032179566937, + "lm_loss": 0.0164794921875, + "loss": 0.0078, + "step": 2816, + "total_loss": 0.0164794921875 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019352577628185843, + "lm_loss": 0.0150146484375, + "loss": 0.0089, + "step": 2817, + "total_loss": 0.0150146484375 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019352122922521717, + "lm_loss": 0.007049560546875, + "loss": 0.0067, + "step": 2818, + "total_loss": 0.007049560546875 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019351668062582062, + "lm_loss": 0.012939453125, + "loss": 0.0077, + "step": 2819, + "total_loss": 0.012939453125 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019351213048374383, + "lm_loss": 0.01544189453125, + "loss": 0.0077, + "step": 2820, + "total_loss": 0.01544189453125 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019350757879906184, + "lm_loss": 0.0079345703125, + "loss": 0.0079, + "step": 2821, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001935030255718497, + "lm_loss": 0.00970458984375, + "loss": 0.0064, + "step": 2822, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001934984708021826, + "lm_loss": 0.0091552734375, + "loss": 0.0087, + "step": 2823, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019349391449013562, + "lm_loss": 0.0072021484375, + "loss": 0.0085, + "step": 2824, + "total_loss": 0.0072021484375 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019348935663578392, + "lm_loss": 0.0037384033203125, + "loss": 0.0066, + "step": 2825, + "total_loss": 0.0037384033203125 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019348479723920272, + "lm_loss": 0.00634765625, + "loss": 0.0062, + "step": 2826, + "total_loss": 0.00634765625 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001934802363004672, + "lm_loss": 0.00518798828125, + "loss": 0.0074, + "step": 2827, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019347567381965264, + "lm_loss": 0.004791259765625, + "loss": 0.007, + "step": 2828, + "total_loss": 0.004791259765625 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019347110979683427, + "lm_loss": 0.009033203125, + "loss": 0.0079, + "step": 2829, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001934665442320874, + "lm_loss": 0.006072998046875, + "loss": 0.0072, + "step": 2830, + "total_loss": 0.006072998046875 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019346197712548732, + "lm_loss": 0.004730224609375, + "loss": 0.0068, + "step": 2831, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019345740847710938, + "lm_loss": 0.00127410888671875, + "loss": 0.0066, + "step": 2832, + "total_loss": 0.00127410888671875 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019345283828702895, + "lm_loss": 0.00286865234375, + "loss": 0.0071, + "step": 2833, + "total_loss": 0.00286865234375 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019344826655532144, + "lm_loss": 0.004730224609375, + "loss": 0.007, + "step": 2834, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019344369328206228, + "lm_loss": 0.0108642578125, + "loss": 0.0078, + "step": 2835, + "total_loss": 0.0108642578125 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019343911846732683, + "lm_loss": 0.010986328125, + "loss": 0.0072, + "step": 2836, + "total_loss": 0.010986328125 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019343454211119063, + "lm_loss": 0.00860595703125, + "loss": 0.0074, + "step": 2837, + "total_loss": 0.00860595703125 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019342996421372913, + "lm_loss": 0.00848388671875, + "loss": 0.0078, + "step": 2838, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001934253847750179, + "lm_loss": 0.014404296875, + "loss": 0.0068, + "step": 2839, + "total_loss": 0.014404296875 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019342080379513242, + "lm_loss": 0.00830078125, + "loss": 0.0082, + "step": 2840, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019341622127414833, + "lm_loss": 0.01104736328125, + "loss": 0.0074, + "step": 2841, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019341163721214117, + "lm_loss": 0.003631591796875, + "loss": 0.0063, + "step": 2842, + "total_loss": 0.003631591796875 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019340705160918658, + "lm_loss": 0.00933837890625, + "loss": 0.0089, + "step": 2843, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001934024644653602, + "lm_loss": 0.007476806640625, + "loss": 0.0071, + "step": 2844, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019339787578073772, + "lm_loss": 0.0069580078125, + "loss": 0.0074, + "step": 2845, + "total_loss": 0.0069580078125 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001933932855553948, + "lm_loss": 0.0054931640625, + "loss": 0.0078, + "step": 2846, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001933886937894072, + "lm_loss": 0.0016326904296875, + "loss": 0.0069, + "step": 2847, + "total_loss": 0.0016326904296875 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019338410048285064, + "lm_loss": 0.0194091796875, + "loss": 0.0088, + "step": 2848, + "total_loss": 0.0194091796875 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001933795056358009, + "lm_loss": 0.0166015625, + "loss": 0.0088, + "step": 2849, + "total_loss": 0.0166015625 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019337490924833378, + "lm_loss": 0.00885009765625, + "loss": 0.0081, + "step": 2850, + "total_loss": 0.00885009765625 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001933703113205251, + "lm_loss": 0.0036468505859375, + "loss": 0.0081, + "step": 2851, + "total_loss": 0.0036468505859375 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019336571185245074, + "lm_loss": 0.004364013671875, + "loss": 0.009, + "step": 2852, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019336111084418653, + "lm_loss": 0.005279541015625, + "loss": 0.0097, + "step": 2853, + "total_loss": 0.005279541015625 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019335650829580838, + "lm_loss": 0.01153564453125, + "loss": 0.0072, + "step": 2854, + "total_loss": 0.01153564453125 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019335190420739223, + "lm_loss": 0.006500244140625, + "loss": 0.0092, + "step": 2855, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019334729857901404, + "lm_loss": 0.00994873046875, + "loss": 0.0092, + "step": 2856, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019334269141074972, + "lm_loss": 0.0048828125, + "loss": 0.0073, + "step": 2857, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019333808270267537, + "lm_loss": 0.005096435546875, + "loss": 0.008, + "step": 2858, + "total_loss": 0.005096435546875 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019333347245486694, + "lm_loss": 0.00811767578125, + "loss": 0.007, + "step": 2859, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001933288606674005, + "lm_loss": 0.005126953125, + "loss": 0.0058, + "step": 2860, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019332424734035217, + "lm_loss": 0.005340576171875, + "loss": 0.0068, + "step": 2861, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019331963247379798, + "lm_loss": 0.005157470703125, + "loss": 0.0083, + "step": 2862, + "total_loss": 0.005157470703125 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019331501606781415, + "lm_loss": 0.00335693359375, + "loss": 0.0068, + "step": 2863, + "total_loss": 0.00335693359375 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019331039812247672, + "lm_loss": 0.00286865234375, + "loss": 0.0061, + "step": 2864, + "total_loss": 0.00286865234375 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019330577863786196, + "lm_loss": 0.00738525390625, + "loss": 0.0073, + "step": 2865, + "total_loss": 0.00738525390625 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019330115761404603, + "lm_loss": 0.010498046875, + "loss": 0.0075, + "step": 2866, + "total_loss": 0.010498046875 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001932965350511052, + "lm_loss": 0.006317138671875, + "loss": 0.0087, + "step": 2867, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019329191094911568, + "lm_loss": 0.0035400390625, + "loss": 0.0086, + "step": 2868, + "total_loss": 0.0035400390625 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019328728530815375, + "lm_loss": 0.00421142578125, + "loss": 0.0076, + "step": 2869, + "total_loss": 0.00421142578125 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019328265812829578, + "lm_loss": 0.01171875, + "loss": 0.0111, + "step": 2870, + "total_loss": 0.01171875 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019327802940961807, + "lm_loss": 0.01177978515625, + "loss": 0.0069, + "step": 2871, + "total_loss": 0.01177978515625 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001932733991521969, + "lm_loss": 0.00543212890625, + "loss": 0.0087, + "step": 2872, + "total_loss": 0.00543212890625 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019326876735610874, + "lm_loss": 0.00653076171875, + "loss": 0.0065, + "step": 2873, + "total_loss": 0.00653076171875 + }, + { + "epoch": 1.17, + "learning_rate": 0.00019326413402142998, + "lm_loss": 0.0091552734375, + "loss": 0.0076, + "step": 2874, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019325949914823705, + "lm_loss": 0.00677490234375, + "loss": 0.0079, + "step": 2875, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019325486273660643, + "lm_loss": 0.00830078125, + "loss": 0.008, + "step": 2876, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019325022478661454, + "lm_loss": 0.005035400390625, + "loss": 0.0086, + "step": 2877, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019324558529833794, + "lm_loss": 0.0108642578125, + "loss": 0.0094, + "step": 2878, + "total_loss": 0.0108642578125 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019324094427185316, + "lm_loss": 0.009033203125, + "loss": 0.0066, + "step": 2879, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019323630170723673, + "lm_loss": 0.0086669921875, + "loss": 0.0094, + "step": 2880, + "total_loss": 0.0086669921875 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019323165760456526, + "lm_loss": 0.0118408203125, + "loss": 0.0094, + "step": 2881, + "total_loss": 0.0118408203125 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019322701196391537, + "lm_loss": 0.00543212890625, + "loss": 0.0072, + "step": 2882, + "total_loss": 0.00543212890625 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001932223647853637, + "lm_loss": 0.010986328125, + "loss": 0.0082, + "step": 2883, + "total_loss": 0.010986328125 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019321771606898688, + "lm_loss": 0.004119873046875, + "loss": 0.0078, + "step": 2884, + "total_loss": 0.004119873046875 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001932130658148616, + "lm_loss": 0.01611328125, + "loss": 0.0094, + "step": 2885, + "total_loss": 0.01611328125 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001932084140230646, + "lm_loss": 0.007568359375, + "loss": 0.0065, + "step": 2886, + "total_loss": 0.007568359375 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019320376069367258, + "lm_loss": 0.002685546875, + "loss": 0.0075, + "step": 2887, + "total_loss": 0.002685546875 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019319910582676236, + "lm_loss": 0.006195068359375, + "loss": 0.0083, + "step": 2888, + "total_loss": 0.006195068359375 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019319444942241066, + "lm_loss": 0.007415771484375, + "loss": 0.0061, + "step": 2889, + "total_loss": 0.007415771484375 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019318979148069435, + "lm_loss": 0.007476806640625, + "loss": 0.0075, + "step": 2890, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019318513200169022, + "lm_loss": 0.005126953125, + "loss": 0.0079, + "step": 2891, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019318047098547517, + "lm_loss": 0.0101318359375, + "loss": 0.0074, + "step": 2892, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019317580843212606, + "lm_loss": 0.01043701171875, + "loss": 0.0077, + "step": 2893, + "total_loss": 0.01043701171875 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019317114434171985, + "lm_loss": 0.0101318359375, + "loss": 0.0052, + "step": 2894, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019316647871433343, + "lm_loss": 0.006622314453125, + "loss": 0.007, + "step": 2895, + "total_loss": 0.006622314453125 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001931618115500438, + "lm_loss": 0.006683349609375, + "loss": 0.0079, + "step": 2896, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019315714284892795, + "lm_loss": 0.00750732421875, + "loss": 0.0093, + "step": 2897, + "total_loss": 0.00750732421875 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019315247261106287, + "lm_loss": 0.007720947265625, + "loss": 0.0067, + "step": 2898, + "total_loss": 0.007720947265625 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019314780083652563, + "lm_loss": 0.0098876953125, + "loss": 0.0092, + "step": 2899, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001931431275253933, + "lm_loss": 0.00250244140625, + "loss": 0.008, + "step": 2900, + "total_loss": 0.00250244140625 + }, + { + "epoch": 1.19, + "eval_lm_loss": 0.00999415386468172, + "eval_loss": 0.010283890180289745, + "eval_runtime": 44.1302, + "eval_samples_per_second": 22.66, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.00999415386468172, + "lm_loss": 0.00121307373046875, + "step": 2900, + "total_loss": 0.00121307373046875 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019313845267774293, + "lm_loss": 0.005401611328125, + "loss": 0.0077, + "step": 2901, + "total_loss": 0.005401611328125 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019313377629365168, + "lm_loss": 0.005889892578125, + "loss": 0.0061, + "step": 2902, + "total_loss": 0.005889892578125 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001931290983731967, + "lm_loss": 0.00640869140625, + "loss": 0.0064, + "step": 2903, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019312441891645508, + "lm_loss": 0.008056640625, + "loss": 0.0085, + "step": 2904, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001931197379235041, + "lm_loss": 0.0086669921875, + "loss": 0.0083, + "step": 2905, + "total_loss": 0.0086669921875 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019311505539442095, + "lm_loss": 0.01519775390625, + "loss": 0.0066, + "step": 2906, + "total_loss": 0.01519775390625 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001931103713292829, + "lm_loss": 0.004791259765625, + "loss": 0.0077, + "step": 2907, + "total_loss": 0.004791259765625 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019310568572816715, + "lm_loss": 0.004974365234375, + "loss": 0.0072, + "step": 2908, + "total_loss": 0.004974365234375 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019310099859115107, + "lm_loss": 0.00738525390625, + "loss": 0.0091, + "step": 2909, + "total_loss": 0.00738525390625 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019309630991831193, + "lm_loss": 0.0133056640625, + "loss": 0.0075, + "step": 2910, + "total_loss": 0.0133056640625 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001930916197097271, + "lm_loss": 0.00439453125, + "loss": 0.0066, + "step": 2911, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019308692796547394, + "lm_loss": 0.00994873046875, + "loss": 0.0074, + "step": 2912, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001930822346856299, + "lm_loss": 0.004364013671875, + "loss": 0.0066, + "step": 2913, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019307753987027232, + "lm_loss": 0.0052490234375, + "loss": 0.0094, + "step": 2914, + "total_loss": 0.0052490234375 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019307284351947868, + "lm_loss": 0.0135498046875, + "loss": 0.0088, + "step": 2915, + "total_loss": 0.0135498046875 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019306814563332646, + "lm_loss": 0.0118408203125, + "loss": 0.0076, + "step": 2916, + "total_loss": 0.0118408203125 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001930634462118931, + "lm_loss": 0.0081787109375, + "loss": 0.0073, + "step": 2917, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019305874525525624, + "lm_loss": 0.00677490234375, + "loss": 0.0081, + "step": 2918, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019305404276349336, + "lm_loss": 0.00494384765625, + "loss": 0.0068, + "step": 2919, + "total_loss": 0.00494384765625 + }, + { + "epoch": 1.19, + "learning_rate": 0.000193049338736682, + "lm_loss": 0.00836181640625, + "loss": 0.0073, + "step": 2920, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019304463317489982, + "lm_loss": 0.007720947265625, + "loss": 0.0067, + "step": 2921, + "total_loss": 0.007720947265625 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019303992607822443, + "lm_loss": 0.01123046875, + "loss": 0.0086, + "step": 2922, + "total_loss": 0.01123046875 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019303521744673345, + "lm_loss": 0.004241943359375, + "loss": 0.0077, + "step": 2923, + "total_loss": 0.004241943359375 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001930305072805046, + "lm_loss": 0.00701904296875, + "loss": 0.0068, + "step": 2924, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019302579557961552, + "lm_loss": 0.007293701171875, + "loss": 0.0077, + "step": 2925, + "total_loss": 0.007293701171875 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019302108234414402, + "lm_loss": 0.00531005859375, + "loss": 0.0053, + "step": 2926, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019301636757416776, + "lm_loss": 0.00933837890625, + "loss": 0.0079, + "step": 2927, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019301165126976456, + "lm_loss": 0.004638671875, + "loss": 0.0085, + "step": 2928, + "total_loss": 0.004638671875 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019300693343101227, + "lm_loss": 0.0069580078125, + "loss": 0.0079, + "step": 2929, + "total_loss": 0.0069580078125 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019300221405798863, + "lm_loss": 0.00811767578125, + "loss": 0.0093, + "step": 2930, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019299749315077153, + "lm_loss": 0.006378173828125, + "loss": 0.0082, + "step": 2931, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019299277070943885, + "lm_loss": 0.015869140625, + "loss": 0.0071, + "step": 2932, + "total_loss": 0.015869140625 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001929880467340685, + "lm_loss": 0.01263427734375, + "loss": 0.0094, + "step": 2933, + "total_loss": 0.01263427734375 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019298332122473838, + "lm_loss": 0.004302978515625, + "loss": 0.0087, + "step": 2934, + "total_loss": 0.004302978515625 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019297859418152647, + "lm_loss": 0.01312255859375, + "loss": 0.0081, + "step": 2935, + "total_loss": 0.01312255859375 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019297386560451077, + "lm_loss": 0.007537841796875, + "loss": 0.0082, + "step": 2936, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019296913549376923, + "lm_loss": 0.01141357421875, + "loss": 0.0076, + "step": 2937, + "total_loss": 0.01141357421875 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001929644038493799, + "lm_loss": 0.00701904296875, + "loss": 0.0084, + "step": 2938, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019295967067142083, + "lm_loss": 0.009765625, + "loss": 0.0073, + "step": 2939, + "total_loss": 0.009765625 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019295493595997013, + "lm_loss": 0.00323486328125, + "loss": 0.0063, + "step": 2940, + "total_loss": 0.00323486328125 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001929501997151059, + "lm_loss": 0.005096435546875, + "loss": 0.0091, + "step": 2941, + "total_loss": 0.005096435546875 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001929454619369062, + "lm_loss": 0.010009765625, + "loss": 0.0071, + "step": 2942, + "total_loss": 0.010009765625 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019294072262544929, + "lm_loss": 0.009521484375, + "loss": 0.0068, + "step": 2943, + "total_loss": 0.009521484375 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019293598178081326, + "lm_loss": 0.0111083984375, + "loss": 0.0089, + "step": 2944, + "total_loss": 0.0111083984375 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001929312394030764, + "lm_loss": 0.0130615234375, + "loss": 0.007, + "step": 2945, + "total_loss": 0.0130615234375 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019292649549231688, + "lm_loss": 0.004241943359375, + "loss": 0.0059, + "step": 2946, + "total_loss": 0.004241943359375 + }, + { + "epoch": 1.2, + "learning_rate": 0.00019292175004861296, + "lm_loss": 0.00189971923828125, + "loss": 0.0073, + "step": 2947, + "total_loss": 0.00189971923828125 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019291700307204297, + "lm_loss": 0.007476806640625, + "loss": 0.0087, + "step": 2948, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019291225456268517, + "lm_loss": 0.004150390625, + "loss": 0.008, + "step": 2949, + "total_loss": 0.004150390625 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001929075045206179, + "lm_loss": 0.0081787109375, + "loss": 0.0089, + "step": 2950, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019290275294591954, + "lm_loss": 0.003936767578125, + "loss": 0.0075, + "step": 2951, + "total_loss": 0.003936767578125 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019289799983866846, + "lm_loss": 0.004669189453125, + "loss": 0.0088, + "step": 2952, + "total_loss": 0.004669189453125 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001928932451989431, + "lm_loss": 0.007293701171875, + "loss": 0.0087, + "step": 2953, + "total_loss": 0.007293701171875 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019288848902682183, + "lm_loss": 0.0081787109375, + "loss": 0.0065, + "step": 2954, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019288373132238316, + "lm_loss": 0.01129150390625, + "loss": 0.0084, + "step": 2955, + "total_loss": 0.01129150390625 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019287897208570553, + "lm_loss": 0.0079345703125, + "loss": 0.0091, + "step": 2956, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001928742113168675, + "lm_loss": 0.004608154296875, + "loss": 0.0078, + "step": 2957, + "total_loss": 0.004608154296875 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019286944901594758, + "lm_loss": 0.005401611328125, + "loss": 0.0077, + "step": 2958, + "total_loss": 0.005401611328125 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019286468518302433, + "lm_loss": 0.00689697265625, + "loss": 0.0088, + "step": 2959, + "total_loss": 0.00689697265625 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019285991981817636, + "lm_loss": 0.0120849609375, + "loss": 0.0076, + "step": 2960, + "total_loss": 0.0120849609375 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019285515292148225, + "lm_loss": 0.006744384765625, + "loss": 0.0083, + "step": 2961, + "total_loss": 0.006744384765625 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019285038449302062, + "lm_loss": 0.0024871826171875, + "loss": 0.0072, + "step": 2962, + "total_loss": 0.0024871826171875 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019284561453287018, + "lm_loss": 0.007537841796875, + "loss": 0.0074, + "step": 2963, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019284084304110958, + "lm_loss": 0.008056640625, + "loss": 0.0096, + "step": 2964, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019283607001781755, + "lm_loss": 0.0037994384765625, + "loss": 0.0071, + "step": 2965, + "total_loss": 0.0037994384765625 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019283129546307284, + "lm_loss": 0.0047607421875, + "loss": 0.0086, + "step": 2966, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001928265193769542, + "lm_loss": 0.00726318359375, + "loss": 0.0093, + "step": 2967, + "total_loss": 0.00726318359375 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001928217417595404, + "lm_loss": 0.01068115234375, + "loss": 0.0068, + "step": 2968, + "total_loss": 0.01068115234375 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019281696261091026, + "lm_loss": 0.02197265625, + "loss": 0.0085, + "step": 2969, + "total_loss": 0.02197265625 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019281218193114262, + "lm_loss": 0.006103515625, + "loss": 0.0066, + "step": 2970, + "total_loss": 0.006103515625 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001928073997203164, + "lm_loss": 0.00994873046875, + "loss": 0.0086, + "step": 2971, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001928026159785104, + "lm_loss": 0.019775390625, + "loss": 0.0096, + "step": 2972, + "total_loss": 0.019775390625 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019279783070580355, + "lm_loss": 0.004150390625, + "loss": 0.0064, + "step": 2973, + "total_loss": 0.004150390625 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019279304390227485, + "lm_loss": 0.0040283203125, + "loss": 0.0062, + "step": 2974, + "total_loss": 0.0040283203125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001927882555680032, + "lm_loss": 0.0045166015625, + "loss": 0.0058, + "step": 2975, + "total_loss": 0.0045166015625 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019278346570306764, + "lm_loss": 0.004150390625, + "loss": 0.0066, + "step": 2976, + "total_loss": 0.004150390625 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001927786743075471, + "lm_loss": 0.00927734375, + "loss": 0.0091, + "step": 2977, + "total_loss": 0.00927734375 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019277388138152075, + "lm_loss": 0.006988525390625, + "loss": 0.0076, + "step": 2978, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019276908692506756, + "lm_loss": 0.00439453125, + "loss": 0.0083, + "step": 2979, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019276429093826667, + "lm_loss": 0.00439453125, + "loss": 0.0086, + "step": 2980, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019275949342119716, + "lm_loss": 0.006103515625, + "loss": 0.0078, + "step": 2981, + "total_loss": 0.006103515625 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019275469437393818, + "lm_loss": 0.006011962890625, + "loss": 0.0076, + "step": 2982, + "total_loss": 0.006011962890625 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001927498937965689, + "lm_loss": 0.01513671875, + "loss": 0.0095, + "step": 2983, + "total_loss": 0.01513671875 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019274509168916852, + "lm_loss": 0.00701904296875, + "loss": 0.0075, + "step": 2984, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019274028805181623, + "lm_loss": 0.00823974609375, + "loss": 0.0075, + "step": 2985, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001927354828845913, + "lm_loss": 0.005035400390625, + "loss": 0.0082, + "step": 2986, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019273067618757304, + "lm_loss": 0.0054931640625, + "loss": 0.0077, + "step": 2987, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019272586796084063, + "lm_loss": 0.003326416015625, + "loss": 0.006, + "step": 2988, + "total_loss": 0.003326416015625 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001927210582044735, + "lm_loss": 0.0216064453125, + "loss": 0.009, + "step": 2989, + "total_loss": 0.0216064453125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001927162469185509, + "lm_loss": 0.002960205078125, + "loss": 0.0075, + "step": 2990, + "total_loss": 0.002960205078125 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019271143410315227, + "lm_loss": 0.00262451171875, + "loss": 0.0078, + "step": 2991, + "total_loss": 0.00262451171875 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019270661975835695, + "lm_loss": 0.004852294921875, + "loss": 0.007, + "step": 2992, + "total_loss": 0.004852294921875 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019270180388424442, + "lm_loss": 0.010009765625, + "loss": 0.0096, + "step": 2993, + "total_loss": 0.010009765625 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019269698648089408, + "lm_loss": 0.01202392578125, + "loss": 0.0076, + "step": 2994, + "total_loss": 0.01202392578125 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019269216754838538, + "lm_loss": 0.00116729736328125, + "loss": 0.0083, + "step": 2995, + "total_loss": 0.00116729736328125 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019268734708679787, + "lm_loss": 0.00531005859375, + "loss": 0.0074, + "step": 2996, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019268252509621104, + "lm_loss": 0.0048828125, + "loss": 0.0088, + "step": 2997, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019267770157670446, + "lm_loss": 0.005340576171875, + "loss": 0.0065, + "step": 2998, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019267287652835764, + "lm_loss": 0.003753662109375, + "loss": 0.0069, + "step": 2999, + "total_loss": 0.003753662109375 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019266804995125023, + "lm_loss": 0.013671875, + "loss": 0.007, + "step": 3000, + "total_loss": 0.013671875 + }, + { + "epoch": 1.23, + "eval_lm_loss": 0.009405948221683502, + "eval_loss": 0.009687041863799095, + "eval_runtime": 43.9445, + "eval_samples_per_second": 22.756, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009405948221683502, + "lm_loss": 0.0019073486328125, + "step": 3000, + "total_loss": 0.0019073486328125 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019266322184546185, + "lm_loss": 0.00439453125, + "loss": 0.0066, + "step": 3001, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019265839221107212, + "lm_loss": 0.00897216796875, + "loss": 0.0098, + "step": 3002, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001926535610481607, + "lm_loss": 0.00848388671875, + "loss": 0.0069, + "step": 3003, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019264872835680734, + "lm_loss": 0.005462646484375, + "loss": 0.0071, + "step": 3004, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019264389413709172, + "lm_loss": 0.0047607421875, + "loss": 0.0079, + "step": 3005, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001926390583890936, + "lm_loss": 0.0036163330078125, + "loss": 0.0108, + "step": 3006, + "total_loss": 0.0036163330078125 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019263422111289274, + "lm_loss": 0.0052490234375, + "loss": 0.0081, + "step": 3007, + "total_loss": 0.0052490234375 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019262938230856896, + "lm_loss": 0.00494384765625, + "loss": 0.0085, + "step": 3008, + "total_loss": 0.00494384765625 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019262454197620205, + "lm_loss": 0.005950927734375, + "loss": 0.0085, + "step": 3009, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019261970011587187, + "lm_loss": 0.00677490234375, + "loss": 0.0075, + "step": 3010, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019261485672765832, + "lm_loss": 0.004425048828125, + "loss": 0.0101, + "step": 3011, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019261001181164128, + "lm_loss": 0.005157470703125, + "loss": 0.0067, + "step": 3012, + "total_loss": 0.005157470703125 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019260516536790068, + "lm_loss": 0.0107421875, + "loss": 0.0077, + "step": 3013, + "total_loss": 0.0107421875 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019260031739651642, + "lm_loss": 0.0084228515625, + "loss": 0.0076, + "step": 3014, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019259546789756853, + "lm_loss": 0.0079345703125, + "loss": 0.0075, + "step": 3015, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.23, + "learning_rate": 0.000192590616871137, + "lm_loss": 0.00836181640625, + "loss": 0.0079, + "step": 3016, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019258576431730183, + "lm_loss": 0.0126953125, + "loss": 0.0084, + "step": 3017, + "total_loss": 0.0126953125 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001925809102361431, + "lm_loss": 0.00848388671875, + "loss": 0.0073, + "step": 3018, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.23, + "learning_rate": 0.00019257605462774086, + "lm_loss": 0.01190185546875, + "loss": 0.0081, + "step": 3019, + "total_loss": 0.01190185546875 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001925711974921752, + "lm_loss": 0.0091552734375, + "loss": 0.0065, + "step": 3020, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001925663388295263, + "lm_loss": 0.007049560546875, + "loss": 0.0063, + "step": 3021, + "total_loss": 0.007049560546875 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019256147863987426, + "lm_loss": 0.0118408203125, + "loss": 0.0075, + "step": 3022, + "total_loss": 0.0118408203125 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019255661692329926, + "lm_loss": 0.01300048828125, + "loss": 0.0079, + "step": 3023, + "total_loss": 0.01300048828125 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019255175367988153, + "lm_loss": 0.00494384765625, + "loss": 0.0071, + "step": 3024, + "total_loss": 0.00494384765625 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019254688890970125, + "lm_loss": 0.005584716796875, + "loss": 0.0064, + "step": 3025, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001925420226128387, + "lm_loss": 0.008544921875, + "loss": 0.0091, + "step": 3026, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019253715478937417, + "lm_loss": 0.01171875, + "loss": 0.0082, + "step": 3027, + "total_loss": 0.01171875 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019253228543938792, + "lm_loss": 0.005096435546875, + "loss": 0.0072, + "step": 3028, + "total_loss": 0.005096435546875 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001925274145629603, + "lm_loss": 0.002593994140625, + "loss": 0.0079, + "step": 3029, + "total_loss": 0.002593994140625 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019252254216017167, + "lm_loss": 0.01385498046875, + "loss": 0.0098, + "step": 3030, + "total_loss": 0.01385498046875 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019251766823110238, + "lm_loss": 0.004608154296875, + "loss": 0.007, + "step": 3031, + "total_loss": 0.004608154296875 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019251279277583287, + "lm_loss": 0.005035400390625, + "loss": 0.0073, + "step": 3032, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019250791579444353, + "lm_loss": 0.00958251953125, + "loss": 0.0087, + "step": 3033, + "total_loss": 0.00958251953125 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019250303728701485, + "lm_loss": 0.012939453125, + "loss": 0.007, + "step": 3034, + "total_loss": 0.012939453125 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019249815725362726, + "lm_loss": 0.0079345703125, + "loss": 0.0079, + "step": 3035, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019249327569436132, + "lm_loss": 0.0027923583984375, + "loss": 0.0079, + "step": 3036, + "total_loss": 0.0027923583984375 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001924883926092975, + "lm_loss": 0.0052490234375, + "loss": 0.0081, + "step": 3037, + "total_loss": 0.0052490234375 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019248350799851637, + "lm_loss": 0.01416015625, + "loss": 0.0072, + "step": 3038, + "total_loss": 0.01416015625 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019247862186209854, + "lm_loss": 0.0078125, + "loss": 0.0075, + "step": 3039, + "total_loss": 0.0078125 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019247373420012458, + "lm_loss": 0.00811767578125, + "loss": 0.0075, + "step": 3040, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019246884501267512, + "lm_loss": 0.01043701171875, + "loss": 0.0081, + "step": 3041, + "total_loss": 0.01043701171875 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001924639542998308, + "lm_loss": 0.011474609375, + "loss": 0.0088, + "step": 3042, + "total_loss": 0.011474609375 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019245906206167237, + "lm_loss": 0.0042724609375, + "loss": 0.007, + "step": 3043, + "total_loss": 0.0042724609375 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019245416829828047, + "lm_loss": 0.0108642578125, + "loss": 0.0085, + "step": 3044, + "total_loss": 0.0108642578125 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019244927300973586, + "lm_loss": 0.0091552734375, + "loss": 0.0082, + "step": 3045, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019244437619611923, + "lm_loss": 0.004638671875, + "loss": 0.0063, + "step": 3046, + "total_loss": 0.004638671875 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019243947785751143, + "lm_loss": 0.01275634765625, + "loss": 0.0086, + "step": 3047, + "total_loss": 0.01275634765625 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019243457799399325, + "lm_loss": 0.0076904296875, + "loss": 0.01, + "step": 3048, + "total_loss": 0.0076904296875 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001924296766056455, + "lm_loss": 0.0108642578125, + "loss": 0.0092, + "step": 3049, + "total_loss": 0.0108642578125 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019242477369254906, + "lm_loss": 0.0084228515625, + "loss": 0.0093, + "step": 3050, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019241986925478476, + "lm_loss": 0.0103759765625, + "loss": 0.0081, + "step": 3051, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001924149632924336, + "lm_loss": 0.00506591796875, + "loss": 0.0054, + "step": 3052, + "total_loss": 0.00506591796875 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001924100558055764, + "lm_loss": 0.0096435546875, + "loss": 0.0072, + "step": 3053, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019240514679429418, + "lm_loss": 0.0089111328125, + "loss": 0.008, + "step": 3054, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019240023625866792, + "lm_loss": 0.0130615234375, + "loss": 0.0094, + "step": 3055, + "total_loss": 0.0130615234375 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001923953241987786, + "lm_loss": 0.01373291015625, + "loss": 0.0078, + "step": 3056, + "total_loss": 0.01373291015625 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001923904106147073, + "lm_loss": 0.00811767578125, + "loss": 0.0072, + "step": 3057, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.25, + "learning_rate": 0.000192385495506535, + "lm_loss": 0.005828857421875, + "loss": 0.006, + "step": 3058, + "total_loss": 0.005828857421875 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019238057887434283, + "lm_loss": 0.0159912109375, + "loss": 0.0091, + "step": 3059, + "total_loss": 0.0159912109375 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019237566071821189, + "lm_loss": 0.0111083984375, + "loss": 0.0092, + "step": 3060, + "total_loss": 0.0111083984375 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001923707410382233, + "lm_loss": 0.004852294921875, + "loss": 0.0066, + "step": 3061, + "total_loss": 0.004852294921875 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019236581983445822, + "lm_loss": 0.00787353515625, + "loss": 0.0091, + "step": 3062, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019236089710699785, + "lm_loss": 0.0048828125, + "loss": 0.0059, + "step": 3063, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019235597285592338, + "lm_loss": 0.00811767578125, + "loss": 0.0073, + "step": 3064, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019235104708131603, + "lm_loss": 0.002716064453125, + "loss": 0.0088, + "step": 3065, + "total_loss": 0.002716064453125 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019234611978325713, + "lm_loss": 0.00787353515625, + "loss": 0.0088, + "step": 3066, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019234119096182785, + "lm_loss": 0.0045166015625, + "loss": 0.0065, + "step": 3067, + "total_loss": 0.0045166015625 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019233626061710955, + "lm_loss": 0.01129150390625, + "loss": 0.0065, + "step": 3068, + "total_loss": 0.01129150390625 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001923313287491836, + "lm_loss": 0.0025177001953125, + "loss": 0.0076, + "step": 3069, + "total_loss": 0.0025177001953125 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001923263953581313, + "lm_loss": 0.00885009765625, + "loss": 0.0076, + "step": 3070, + "total_loss": 0.00885009765625 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019232146044403407, + "lm_loss": 0.0096435546875, + "loss": 0.0074, + "step": 3071, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001923165240069733, + "lm_loss": 0.0042724609375, + "loss": 0.008, + "step": 3072, + "total_loss": 0.0042724609375 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001923115860470304, + "lm_loss": 0.01220703125, + "loss": 0.009, + "step": 3073, + "total_loss": 0.01220703125 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019230664656428692, + "lm_loss": 0.01251220703125, + "loss": 0.0085, + "step": 3074, + "total_loss": 0.01251220703125 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019230170555882423, + "lm_loss": 0.009765625, + "loss": 0.0109, + "step": 3075, + "total_loss": 0.009765625 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019229676303072387, + "lm_loss": 0.0036468505859375, + "loss": 0.0071, + "step": 3076, + "total_loss": 0.0036468505859375 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019229181898006742, + "lm_loss": 0.0037841796875, + "loss": 0.0083, + "step": 3077, + "total_loss": 0.0037841796875 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019228687340693638, + "lm_loss": 0.00640869140625, + "loss": 0.007, + "step": 3078, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001922819263114124, + "lm_loss": 0.007568359375, + "loss": 0.0071, + "step": 3079, + "total_loss": 0.007568359375 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019227697769357702, + "lm_loss": 0.0155029296875, + "loss": 0.0094, + "step": 3080, + "total_loss": 0.0155029296875 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001922720275535119, + "lm_loss": 0.01202392578125, + "loss": 0.01, + "step": 3081, + "total_loss": 0.01202392578125 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019226707589129876, + "lm_loss": 0.01055908203125, + "loss": 0.0088, + "step": 3082, + "total_loss": 0.01055908203125 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019226212270701917, + "lm_loss": 0.00787353515625, + "loss": 0.0076, + "step": 3083, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001922571680007549, + "lm_loss": 0.004608154296875, + "loss": 0.0077, + "step": 3084, + "total_loss": 0.004608154296875 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001922522117725877, + "lm_loss": 0.005218505859375, + "loss": 0.0069, + "step": 3085, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019224725402259933, + "lm_loss": 0.01336669921875, + "loss": 0.0082, + "step": 3086, + "total_loss": 0.01336669921875 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019224229475087152, + "lm_loss": 0.004425048828125, + "loss": 0.0081, + "step": 3087, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019223733395748615, + "lm_loss": 0.00579833984375, + "loss": 0.0068, + "step": 3088, + "total_loss": 0.00579833984375 + }, + { + "epoch": 1.26, + "learning_rate": 0.000192232371642525, + "lm_loss": 0.00909423828125, + "loss": 0.0077, + "step": 3089, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019222740780606995, + "lm_loss": 0.007659912109375, + "loss": 0.0087, + "step": 3090, + "total_loss": 0.007659912109375 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019222244244820286, + "lm_loss": 0.0093994140625, + "loss": 0.0069, + "step": 3091, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019221747556900568, + "lm_loss": 0.01031494140625, + "loss": 0.0093, + "step": 3092, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019221250716856036, + "lm_loss": 0.010498046875, + "loss": 0.0088, + "step": 3093, + "total_loss": 0.010498046875 + }, + { + "epoch": 1.26, + "learning_rate": 0.0001922075372469488, + "lm_loss": 0.0027008056640625, + "loss": 0.0081, + "step": 3094, + "total_loss": 0.0027008056640625 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019220256580425305, + "lm_loss": 0.0081787109375, + "loss": 0.0066, + "step": 3095, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019219759284055504, + "lm_loss": 0.0103759765625, + "loss": 0.0076, + "step": 3096, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019219261835593687, + "lm_loss": 0.00909423828125, + "loss": 0.0074, + "step": 3097, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019218764235048058, + "lm_loss": 0.0140380859375, + "loss": 0.0094, + "step": 3098, + "total_loss": 0.0140380859375 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019218266482426828, + "lm_loss": 0.00653076171875, + "loss": 0.0075, + "step": 3099, + "total_loss": 0.00653076171875 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019217768577738204, + "lm_loss": 0.00946044921875, + "loss": 0.0069, + "step": 3100, + "total_loss": 0.00946044921875 + }, + { + "epoch": 1.27, + "eval_lm_loss": 0.009472317062318325, + "eval_loss": 0.00977653544396162, + "eval_runtime": 43.9397, + "eval_samples_per_second": 22.758, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009472317062318325, + "lm_loss": 0.000766754150390625, + "step": 3100, + "total_loss": 0.000766754150390625 + }, + { + "epoch": 1.27, + "learning_rate": 0.000192172705209904, + "lm_loss": 0.0084228515625, + "loss": 0.0087, + "step": 3101, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019216772312191638, + "lm_loss": 0.007415771484375, + "loss": 0.008, + "step": 3102, + "total_loss": 0.007415771484375 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019216273951350128, + "lm_loss": 0.009033203125, + "loss": 0.0084, + "step": 3103, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.27, + "learning_rate": 0.000192157754384741, + "lm_loss": 0.005889892578125, + "loss": 0.0073, + "step": 3104, + "total_loss": 0.005889892578125 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019215276773571768, + "lm_loss": 0.0031280517578125, + "loss": 0.0059, + "step": 3105, + "total_loss": 0.0031280517578125 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019214777956651365, + "lm_loss": 0.00994873046875, + "loss": 0.0083, + "step": 3106, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019214278987721118, + "lm_loss": 0.00616455078125, + "loss": 0.0066, + "step": 3107, + "total_loss": 0.00616455078125 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019213779866789258, + "lm_loss": 0.00830078125, + "loss": 0.0081, + "step": 3108, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019213280593864015, + "lm_loss": 0.0027923583984375, + "loss": 0.0075, + "step": 3109, + "total_loss": 0.0027923583984375 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019212781168953633, + "lm_loss": 0.00836181640625, + "loss": 0.0086, + "step": 3110, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019212281592066347, + "lm_loss": 0.005706787109375, + "loss": 0.0089, + "step": 3111, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019211781863210395, + "lm_loss": 0.00494384765625, + "loss": 0.0095, + "step": 3112, + "total_loss": 0.00494384765625 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019211281982394021, + "lm_loss": 0.0185546875, + "loss": 0.0092, + "step": 3113, + "total_loss": 0.0185546875 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001921078194962548, + "lm_loss": 0.00274658203125, + "loss": 0.0067, + "step": 3114, + "total_loss": 0.00274658203125 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001921028176491301, + "lm_loss": 0.0026092529296875, + "loss": 0.0067, + "step": 3115, + "total_loss": 0.0026092529296875 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019209781428264865, + "lm_loss": 0.00494384765625, + "loss": 0.0062, + "step": 3116, + "total_loss": 0.00494384765625 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019209280939689303, + "lm_loss": 0.01385498046875, + "loss": 0.0075, + "step": 3117, + "total_loss": 0.01385498046875 + }, + { + "epoch": 1.27, + "learning_rate": 0.00019208780299194574, + "lm_loss": 0.0101318359375, + "loss": 0.0093, + "step": 3118, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019208279506788945, + "lm_loss": 0.0025787353515625, + "loss": 0.0064, + "step": 3119, + "total_loss": 0.0025787353515625 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019207778562480666, + "lm_loss": 0.01507568359375, + "loss": 0.008, + "step": 3120, + "total_loss": 0.01507568359375 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001920727746627801, + "lm_loss": 0.006317138671875, + "loss": 0.0074, + "step": 3121, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019206776218189242, + "lm_loss": 0.01483154296875, + "loss": 0.0068, + "step": 3122, + "total_loss": 0.01483154296875 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019206274818222628, + "lm_loss": 0.002288818359375, + "loss": 0.0063, + "step": 3123, + "total_loss": 0.002288818359375 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019205773266386437, + "lm_loss": 0.007476806640625, + "loss": 0.0062, + "step": 3124, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019205271562688946, + "lm_loss": 0.008056640625, + "loss": 0.0089, + "step": 3125, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019204769707138435, + "lm_loss": 0.01300048828125, + "loss": 0.0075, + "step": 3126, + "total_loss": 0.01300048828125 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019204267699743176, + "lm_loss": 0.01312255859375, + "loss": 0.0075, + "step": 3127, + "total_loss": 0.01312255859375 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019203765540511456, + "lm_loss": 0.00531005859375, + "loss": 0.0091, + "step": 3128, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019203263229451554, + "lm_loss": 0.012939453125, + "loss": 0.0081, + "step": 3129, + "total_loss": 0.012939453125 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001920276076657176, + "lm_loss": 0.00909423828125, + "loss": 0.0087, + "step": 3130, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019202258151880357, + "lm_loss": 0.01007080078125, + "loss": 0.009, + "step": 3131, + "total_loss": 0.01007080078125 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019201755385385644, + "lm_loss": 0.00933837890625, + "loss": 0.0065, + "step": 3132, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019201252467095908, + "lm_loss": 0.007110595703125, + "loss": 0.0073, + "step": 3133, + "total_loss": 0.007110595703125 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019200749397019453, + "lm_loss": 0.010498046875, + "loss": 0.0088, + "step": 3134, + "total_loss": 0.010498046875 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001920024617516457, + "lm_loss": 0.01385498046875, + "loss": 0.0085, + "step": 3135, + "total_loss": 0.01385498046875 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019199742801539565, + "lm_loss": 0.007232666015625, + "loss": 0.0068, + "step": 3136, + "total_loss": 0.007232666015625 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001919923927615274, + "lm_loss": 0.002838134765625, + "loss": 0.0076, + "step": 3137, + "total_loss": 0.002838134765625 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019198735599012398, + "lm_loss": 0.0036468505859375, + "loss": 0.0064, + "step": 3138, + "total_loss": 0.0036468505859375 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019198231770126855, + "lm_loss": 0.00872802734375, + "loss": 0.0074, + "step": 3139, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019197727789504418, + "lm_loss": 0.00469970703125, + "loss": 0.0074, + "step": 3140, + "total_loss": 0.00469970703125 + }, + { + "epoch": 1.28, + "learning_rate": 0.000191972236571534, + "lm_loss": 0.00628662109375, + "loss": 0.0077, + "step": 3141, + "total_loss": 0.00628662109375 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001919671937308212, + "lm_loss": 0.00787353515625, + "loss": 0.009, + "step": 3142, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019196214937298897, + "lm_loss": 0.0103759765625, + "loss": 0.0073, + "step": 3143, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001919571034981205, + "lm_loss": 0.0140380859375, + "loss": 0.0068, + "step": 3144, + "total_loss": 0.0140380859375 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019195205610629909, + "lm_loss": 0.0145263671875, + "loss": 0.0086, + "step": 3145, + "total_loss": 0.0145263671875 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019194700719760786, + "lm_loss": 0.01129150390625, + "loss": 0.0095, + "step": 3146, + "total_loss": 0.01129150390625 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019194195677213024, + "lm_loss": 0.0120849609375, + "loss": 0.0088, + "step": 3147, + "total_loss": 0.0120849609375 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001919369048299495, + "lm_loss": 0.007720947265625, + "loss": 0.0077, + "step": 3148, + "total_loss": 0.007720947265625 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019193185137114896, + "lm_loss": 0.00994873046875, + "loss": 0.0071, + "step": 3149, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019192679639581201, + "lm_loss": 0.004730224609375, + "loss": 0.0068, + "step": 3150, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.29, + "learning_rate": 0.000191921739904022, + "lm_loss": 0.005340576171875, + "loss": 0.0071, + "step": 3151, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019191668189586238, + "lm_loss": 0.0062255859375, + "loss": 0.0088, + "step": 3152, + "total_loss": 0.0062255859375 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019191162237141657, + "lm_loss": 0.009765625, + "loss": 0.0081, + "step": 3153, + "total_loss": 0.009765625 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019190656133076804, + "lm_loss": 0.0084228515625, + "loss": 0.0073, + "step": 3154, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001919014987740003, + "lm_loss": 0.006622314453125, + "loss": 0.0077, + "step": 3155, + "total_loss": 0.006622314453125 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019189643470119685, + "lm_loss": 0.00335693359375, + "loss": 0.0068, + "step": 3156, + "total_loss": 0.00335693359375 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019189136911244118, + "lm_loss": 0.00823974609375, + "loss": 0.0075, + "step": 3157, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001918863020078169, + "lm_loss": 0.0034027099609375, + "loss": 0.0075, + "step": 3158, + "total_loss": 0.0034027099609375 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001918812333874076, + "lm_loss": 0.006591796875, + "loss": 0.0085, + "step": 3159, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001918761632512969, + "lm_loss": 0.006927490234375, + "loss": 0.0094, + "step": 3160, + "total_loss": 0.006927490234375 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019187109159956843, + "lm_loss": 0.00836181640625, + "loss": 0.0079, + "step": 3161, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019186601843230582, + "lm_loss": 0.004913330078125, + "loss": 0.0071, + "step": 3162, + "total_loss": 0.004913330078125 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001918609437495928, + "lm_loss": 0.0089111328125, + "loss": 0.0078, + "step": 3163, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019185586755151305, + "lm_loss": 0.003662109375, + "loss": 0.0079, + "step": 3164, + "total_loss": 0.003662109375 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019185078983815034, + "lm_loss": 0.0034942626953125, + "loss": 0.0079, + "step": 3165, + "total_loss": 0.0034942626953125 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019184571060958846, + "lm_loss": 0.003204345703125, + "loss": 0.0067, + "step": 3166, + "total_loss": 0.003204345703125 + }, + { + "epoch": 1.29, + "learning_rate": 0.00019184062986591112, + "lm_loss": 0.0037078857421875, + "loss": 0.0091, + "step": 3167, + "total_loss": 0.0037078857421875 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019183554760720216, + "lm_loss": 0.005096435546875, + "loss": 0.0089, + "step": 3168, + "total_loss": 0.005096435546875 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019183046383354548, + "lm_loss": 0.00799560546875, + "loss": 0.0078, + "step": 3169, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019182537854502485, + "lm_loss": 0.00439453125, + "loss": 0.0077, + "step": 3170, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019182029174172425, + "lm_loss": 0.005859375, + "loss": 0.0072, + "step": 3171, + "total_loss": 0.005859375 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001918152034237275, + "lm_loss": 0.007537841796875, + "loss": 0.0078, + "step": 3172, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019181011359111866, + "lm_loss": 0.0029296875, + "loss": 0.0067, + "step": 3173, + "total_loss": 0.0029296875 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019180502224398156, + "lm_loss": 0.00518798828125, + "loss": 0.0068, + "step": 3174, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019179992938240026, + "lm_loss": 0.00970458984375, + "loss": 0.0087, + "step": 3175, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019179483500645877, + "lm_loss": 0.004425048828125, + "loss": 0.0076, + "step": 3176, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001917897391162411, + "lm_loss": 0.00225830078125, + "loss": 0.0093, + "step": 3177, + "total_loss": 0.00225830078125 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019178464171183132, + "lm_loss": 0.00872802734375, + "loss": 0.0075, + "step": 3178, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019177954279331357, + "lm_loss": 0.01287841796875, + "loss": 0.008, + "step": 3179, + "total_loss": 0.01287841796875 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001917744423607719, + "lm_loss": 0.00933837890625, + "loss": 0.0068, + "step": 3180, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001917693404142905, + "lm_loss": 0.00750732421875, + "loss": 0.0083, + "step": 3181, + "total_loss": 0.00750732421875 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001917642369539535, + "lm_loss": 0.01080322265625, + "loss": 0.0088, + "step": 3182, + "total_loss": 0.01080322265625 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001917591319798451, + "lm_loss": 0.0023040771484375, + "loss": 0.0065, + "step": 3183, + "total_loss": 0.0023040771484375 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019175402549204948, + "lm_loss": 0.002349853515625, + "loss": 0.0082, + "step": 3184, + "total_loss": 0.002349853515625 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019174891749065097, + "lm_loss": 0.0031280517578125, + "loss": 0.0067, + "step": 3185, + "total_loss": 0.0031280517578125 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001917438079757337, + "lm_loss": 0.01263427734375, + "loss": 0.0068, + "step": 3186, + "total_loss": 0.01263427734375 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019173869694738205, + "lm_loss": 0.002288818359375, + "loss": 0.0067, + "step": 3187, + "total_loss": 0.002288818359375 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019173358440568032, + "lm_loss": 0.005767822265625, + "loss": 0.0061, + "step": 3188, + "total_loss": 0.005767822265625 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019172847035071284, + "lm_loss": 0.0126953125, + "loss": 0.0069, + "step": 3189, + "total_loss": 0.0126953125 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019172335478256396, + "lm_loss": 0.01171875, + "loss": 0.0089, + "step": 3190, + "total_loss": 0.01171875 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019171823770131808, + "lm_loss": 0.00872802734375, + "loss": 0.008, + "step": 3191, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019171311910705962, + "lm_loss": 0.005157470703125, + "loss": 0.0088, + "step": 3192, + "total_loss": 0.005157470703125 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019170799899987303, + "lm_loss": 0.0034637451171875, + "loss": 0.0078, + "step": 3193, + "total_loss": 0.0034637451171875 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019170287737984274, + "lm_loss": 0.00439453125, + "loss": 0.0081, + "step": 3194, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019169775424705325, + "lm_loss": 0.007568359375, + "loss": 0.005, + "step": 3195, + "total_loss": 0.007568359375 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019169262960158908, + "lm_loss": 0.0196533203125, + "loss": 0.0084, + "step": 3196, + "total_loss": 0.0196533203125 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019168750344353475, + "lm_loss": 0.004302978515625, + "loss": 0.0065, + "step": 3197, + "total_loss": 0.004302978515625 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019168237577297484, + "lm_loss": 0.006591796875, + "loss": 0.0081, + "step": 3198, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019167724658999396, + "lm_loss": 0.00555419921875, + "loss": 0.0067, + "step": 3199, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019167211589467663, + "lm_loss": 0.0029449462890625, + "loss": 0.0072, + "step": 3200, + "total_loss": 0.0029449462890625 + }, + { + "epoch": 1.31, + "eval_lm_loss": 0.009713852778077126, + "eval_loss": 0.010098733939230442, + "eval_runtime": 44.0862, + "eval_samples_per_second": 22.683, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.009713852778077126, + "lm_loss": 0.000698089599609375, + "step": 3200, + "total_loss": 0.000698089599609375 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019166698368710763, + "lm_loss": 0.011474609375, + "loss": 0.0072, + "step": 3201, + "total_loss": 0.011474609375 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001916618499673715, + "lm_loss": 0.0098876953125, + "loss": 0.0092, + "step": 3202, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019165671473555298, + "lm_loss": 0.01348876953125, + "loss": 0.0083, + "step": 3203, + "total_loss": 0.01348876953125 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001916515779917368, + "lm_loss": 0.004150390625, + "loss": 0.0078, + "step": 3204, + "total_loss": 0.004150390625 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019164643973600763, + "lm_loss": 0.00677490234375, + "loss": 0.0077, + "step": 3205, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001916412999684503, + "lm_loss": 0.0096435546875, + "loss": 0.0082, + "step": 3206, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019163615868914957, + "lm_loss": 0.004669189453125, + "loss": 0.0067, + "step": 3207, + "total_loss": 0.004669189453125 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019163101589819026, + "lm_loss": 0.011962890625, + "loss": 0.0079, + "step": 3208, + "total_loss": 0.011962890625 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001916258715956572, + "lm_loss": 0.00830078125, + "loss": 0.0076, + "step": 3209, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019162072578163524, + "lm_loss": 0.009765625, + "loss": 0.0084, + "step": 3210, + "total_loss": 0.009765625 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001916155784562093, + "lm_loss": 0.011962890625, + "loss": 0.0086, + "step": 3211, + "total_loss": 0.011962890625 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019161042961946426, + "lm_loss": 0.00592041015625, + "loss": 0.0098, + "step": 3212, + "total_loss": 0.00592041015625 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019160527927148505, + "lm_loss": 0.0101318359375, + "loss": 0.0083, + "step": 3213, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019160012741235667, + "lm_loss": 0.0166015625, + "loss": 0.0096, + "step": 3214, + "total_loss": 0.0166015625 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001915949740421641, + "lm_loss": 0.005584716796875, + "loss": 0.0069, + "step": 3215, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019158981916099233, + "lm_loss": 0.01318359375, + "loss": 0.0095, + "step": 3216, + "total_loss": 0.01318359375 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001915846627689264, + "lm_loss": 0.00830078125, + "loss": 0.008, + "step": 3217, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019157950486605135, + "lm_loss": 0.00156402587890625, + "loss": 0.0082, + "step": 3218, + "total_loss": 0.00156402587890625 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019157434545245233, + "lm_loss": 0.007537841796875, + "loss": 0.0069, + "step": 3219, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019156918452821443, + "lm_loss": 0.00848388671875, + "loss": 0.0093, + "step": 3220, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019156402209342274, + "lm_loss": 0.0093994140625, + "loss": 0.0076, + "step": 3221, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019155885814816249, + "lm_loss": 0.006591796875, + "loss": 0.0083, + "step": 3222, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019155369269251878, + "lm_loss": 0.005706787109375, + "loss": 0.0084, + "step": 3223, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001915485257265769, + "lm_loss": 0.0068359375, + "loss": 0.0078, + "step": 3224, + "total_loss": 0.0068359375 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019154335725042203, + "lm_loss": 0.01104736328125, + "loss": 0.008, + "step": 3225, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001915381872641395, + "lm_loss": 0.005859375, + "loss": 0.0074, + "step": 3226, + "total_loss": 0.005859375 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001915330157678145, + "lm_loss": 0.00701904296875, + "loss": 0.0072, + "step": 3227, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001915278427615324, + "lm_loss": 0.006378173828125, + "loss": 0.0083, + "step": 3228, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019152266824537853, + "lm_loss": 0.008544921875, + "loss": 0.0075, + "step": 3229, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019151749221943827, + "lm_loss": 0.007568359375, + "loss": 0.008, + "step": 3230, + "total_loss": 0.007568359375 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019151231468379697, + "lm_loss": 0.01043701171875, + "loss": 0.0076, + "step": 3231, + "total_loss": 0.01043701171875 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019150713563854005, + "lm_loss": 0.007110595703125, + "loss": 0.0058, + "step": 3232, + "total_loss": 0.007110595703125 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019150195508375295, + "lm_loss": 0.00616455078125, + "loss": 0.0088, + "step": 3233, + "total_loss": 0.00616455078125 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019149677301952113, + "lm_loss": 0.00921630859375, + "loss": 0.0083, + "step": 3234, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019149158944593012, + "lm_loss": 0.006500244140625, + "loss": 0.0083, + "step": 3235, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019148640436306532, + "lm_loss": 0.009765625, + "loss": 0.0078, + "step": 3236, + "total_loss": 0.009765625 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019148121777101234, + "lm_loss": 0.00421142578125, + "loss": 0.0062, + "step": 3237, + "total_loss": 0.00421142578125 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019147602966985674, + "lm_loss": 0.018310546875, + "loss": 0.0074, + "step": 3238, + "total_loss": 0.018310546875 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019147084005968407, + "lm_loss": 0.0062255859375, + "loss": 0.0091, + "step": 3239, + "total_loss": 0.0062255859375 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019146564894058, + "lm_loss": 0.006683349609375, + "loss": 0.0081, + "step": 3240, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001914604563126301, + "lm_loss": 0.01092529296875, + "loss": 0.0085, + "step": 3241, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019145526217592007, + "lm_loss": 0.00830078125, + "loss": 0.0067, + "step": 3242, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019145006653053558, + "lm_loss": 0.004974365234375, + "loss": 0.0086, + "step": 3243, + "total_loss": 0.004974365234375 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001914448693765623, + "lm_loss": 0.00872802734375, + "loss": 0.0077, + "step": 3244, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019143967071408606, + "lm_loss": 0.00714111328125, + "loss": 0.0083, + "step": 3245, + "total_loss": 0.00714111328125 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019143447054319253, + "lm_loss": 0.00970458984375, + "loss": 0.0073, + "step": 3246, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019142926886396753, + "lm_loss": 0.008056640625, + "loss": 0.0077, + "step": 3247, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019142406567649686, + "lm_loss": 0.005828857421875, + "loss": 0.008, + "step": 3248, + "total_loss": 0.005828857421875 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019141886098086636, + "lm_loss": 0.008544921875, + "loss": 0.0077, + "step": 3249, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019141365477716188, + "lm_loss": 0.004119873046875, + "loss": 0.0063, + "step": 3250, + "total_loss": 0.004119873046875 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019140844706546931, + "lm_loss": 0.01263427734375, + "loss": 0.0073, + "step": 3251, + "total_loss": 0.01263427734375 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019140323784587454, + "lm_loss": 0.00836181640625, + "loss": 0.007, + "step": 3252, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019139802711846357, + "lm_loss": 0.0037689208984375, + "loss": 0.0068, + "step": 3253, + "total_loss": 0.0037689208984375 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019139281488332225, + "lm_loss": 0.00933837890625, + "loss": 0.0078, + "step": 3254, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019138760114053666, + "lm_loss": 0.00811767578125, + "loss": 0.0071, + "step": 3255, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019138238589019274, + "lm_loss": 0.0023956298828125, + "loss": 0.0104, + "step": 3256, + "total_loss": 0.0023956298828125 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019137716913237657, + "lm_loss": 0.0034637451171875, + "loss": 0.0092, + "step": 3257, + "total_loss": 0.0034637451171875 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019137195086717416, + "lm_loss": 0.01177978515625, + "loss": 0.0081, + "step": 3258, + "total_loss": 0.01177978515625 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019136673109467167, + "lm_loss": 0.00640869140625, + "loss": 0.0072, + "step": 3259, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019136150981495512, + "lm_loss": 0.01031494140625, + "loss": 0.0081, + "step": 3260, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019135628702811065, + "lm_loss": 0.0079345703125, + "loss": 0.0068, + "step": 3261, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001913510627342245, + "lm_loss": 0.006500244140625, + "loss": 0.0081, + "step": 3262, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019134583693338275, + "lm_loss": 0.002899169921875, + "loss": 0.0078, + "step": 3263, + "total_loss": 0.002899169921875 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001913406096256717, + "lm_loss": 0.00860595703125, + "loss": 0.0078, + "step": 3264, + "total_loss": 0.00860595703125 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019133538081117748, + "lm_loss": 0.00909423828125, + "loss": 0.0073, + "step": 3265, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019133015048998644, + "lm_loss": 0.0091552734375, + "loss": 0.0072, + "step": 3266, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019132491866218478, + "lm_loss": 0.01080322265625, + "loss": 0.0097, + "step": 3267, + "total_loss": 0.01080322265625 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019131968532785888, + "lm_loss": 0.01092529296875, + "loss": 0.007, + "step": 3268, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019131445048709505, + "lm_loss": 0.00909423828125, + "loss": 0.0069, + "step": 3269, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019130921413997961, + "lm_loss": 0.01556396484375, + "loss": 0.0082, + "step": 3270, + "total_loss": 0.01556396484375 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019130397628659896, + "lm_loss": 0.0074462890625, + "loss": 0.009, + "step": 3271, + "total_loss": 0.0074462890625 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912987369270395, + "lm_loss": 0.003143310546875, + "loss": 0.0079, + "step": 3272, + "total_loss": 0.003143310546875 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912934960613877, + "lm_loss": 0.005401611328125, + "loss": 0.0072, + "step": 3273, + "total_loss": 0.005401611328125 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019128825368973, + "lm_loss": 0.003997802734375, + "loss": 0.0081, + "step": 3274, + "total_loss": 0.003997802734375 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912830098121528, + "lm_loss": 0.0087890625, + "loss": 0.0064, + "step": 3275, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912777644287427, + "lm_loss": 0.00506591796875, + "loss": 0.0078, + "step": 3276, + "total_loss": 0.00506591796875 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912725175395862, + "lm_loss": 0.01348876953125, + "loss": 0.0075, + "step": 3277, + "total_loss": 0.01348876953125 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019126726914476985, + "lm_loss": 0.00994873046875, + "loss": 0.0065, + "step": 3278, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019126201924438022, + "lm_loss": 0.0068359375, + "loss": 0.0076, + "step": 3279, + "total_loss": 0.0068359375 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019125676783850396, + "lm_loss": 0.01519775390625, + "loss": 0.0074, + "step": 3280, + "total_loss": 0.01519775390625 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019125151492722762, + "lm_loss": 0.01220703125, + "loss": 0.0084, + "step": 3281, + "total_loss": 0.01220703125 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912462605106379, + "lm_loss": 0.009521484375, + "loss": 0.0087, + "step": 3282, + "total_loss": 0.009521484375 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912410045888215, + "lm_loss": 0.013427734375, + "loss": 0.0092, + "step": 3283, + "total_loss": 0.013427734375 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912357471618651, + "lm_loss": 0.0034332275390625, + "loss": 0.006, + "step": 3284, + "total_loss": 0.0034332275390625 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912304882298554, + "lm_loss": 0.00799560546875, + "loss": 0.0071, + "step": 3285, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912252277928792, + "lm_loss": 0.00946044921875, + "loss": 0.0101, + "step": 3286, + "total_loss": 0.00946044921875 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019121996585102328, + "lm_loss": 0.005645751953125, + "loss": 0.0075, + "step": 3287, + "total_loss": 0.005645751953125 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912147024043744, + "lm_loss": 0.00701904296875, + "loss": 0.0067, + "step": 3288, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001912094374530194, + "lm_loss": 0.0089111328125, + "loss": 0.0092, + "step": 3289, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019120417099704516, + "lm_loss": 0.005401611328125, + "loss": 0.0072, + "step": 3290, + "total_loss": 0.005401611328125 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001911989030365385, + "lm_loss": 0.0162353515625, + "loss": 0.0088, + "step": 3291, + "total_loss": 0.0162353515625 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001911936335715864, + "lm_loss": 0.007293701171875, + "loss": 0.0064, + "step": 3292, + "total_loss": 0.007293701171875 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019118836260227572, + "lm_loss": 0.0159912109375, + "loss": 0.009, + "step": 3293, + "total_loss": 0.0159912109375 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019118309012869348, + "lm_loss": 0.0086669921875, + "loss": 0.008, + "step": 3294, + "total_loss": 0.0086669921875 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019117781615092655, + "lm_loss": 0.0079345703125, + "loss": 0.0058, + "step": 3295, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019117254066906204, + "lm_loss": 0.006103515625, + "loss": 0.0058, + "step": 3296, + "total_loss": 0.006103515625 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019116726368318692, + "lm_loss": 0.002899169921875, + "loss": 0.0062, + "step": 3297, + "total_loss": 0.002899169921875 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019116198519338826, + "lm_loss": 0.007476806640625, + "loss": 0.008, + "step": 3298, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001911567051997531, + "lm_loss": 0.01092529296875, + "loss": 0.0087, + "step": 3299, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001911514237023686, + "lm_loss": 0.005218505859375, + "loss": 0.0083, + "step": 3300, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.35, + "eval_lm_loss": 0.0095286900177598, + "eval_loss": 0.00984802283346653, + "eval_runtime": 44.1967, + "eval_samples_per_second": 22.626, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.0095286900177598, + "lm_loss": 0.00159454345703125, + "step": 3300, + "total_loss": 0.00159454345703125 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019114614070132185, + "lm_loss": 0.005218505859375, + "loss": 0.0066, + "step": 3301, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019114085619669998, + "lm_loss": 0.004974365234375, + "loss": 0.0069, + "step": 3302, + "total_loss": 0.004974365234375 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019113557018859024, + "lm_loss": 0.009033203125, + "loss": 0.0073, + "step": 3303, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019113028267707974, + "lm_loss": 0.01092529296875, + "loss": 0.0082, + "step": 3304, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001911249936622557, + "lm_loss": 0.0074462890625, + "loss": 0.0066, + "step": 3305, + "total_loss": 0.0074462890625 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001911197031442055, + "lm_loss": 0.01275634765625, + "loss": 0.0082, + "step": 3306, + "total_loss": 0.01275634765625 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019111441112301626, + "lm_loss": 0.01153564453125, + "loss": 0.0087, + "step": 3307, + "total_loss": 0.01153564453125 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019110911759877538, + "lm_loss": 0.00909423828125, + "loss": 0.0077, + "step": 3308, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019110382257157013, + "lm_loss": 0.006866455078125, + "loss": 0.006, + "step": 3309, + "total_loss": 0.006866455078125 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019109852604148788, + "lm_loss": 0.003814697265625, + "loss": 0.009, + "step": 3310, + "total_loss": 0.003814697265625 + }, + { + "epoch": 1.35, + "learning_rate": 0.000191093228008616, + "lm_loss": 0.00567626953125, + "loss": 0.0066, + "step": 3311, + "total_loss": 0.00567626953125 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019108792847304188, + "lm_loss": 0.0125732421875, + "loss": 0.0071, + "step": 3312, + "total_loss": 0.0125732421875 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019108262743485299, + "lm_loss": 0.004150390625, + "loss": 0.0095, + "step": 3313, + "total_loss": 0.004150390625 + }, + { + "epoch": 1.35, + "learning_rate": 0.0001910773248941367, + "lm_loss": 0.01043701171875, + "loss": 0.0074, + "step": 3314, + "total_loss": 0.01043701171875 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001910720208509805, + "lm_loss": 0.0032196044921875, + "loss": 0.0084, + "step": 3315, + "total_loss": 0.0032196044921875 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019106671530547196, + "lm_loss": 0.010986328125, + "loss": 0.007, + "step": 3316, + "total_loss": 0.010986328125 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019106140825769853, + "lm_loss": 0.006103515625, + "loss": 0.0073, + "step": 3317, + "total_loss": 0.006103515625 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019105609970774775, + "lm_loss": 0.00396728515625, + "loss": 0.0065, + "step": 3318, + "total_loss": 0.00396728515625 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019105078965570726, + "lm_loss": 0.006805419921875, + "loss": 0.0079, + "step": 3319, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019104547810166458, + "lm_loss": 0.004180908203125, + "loss": 0.0084, + "step": 3320, + "total_loss": 0.004180908203125 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001910401650457074, + "lm_loss": 0.006103515625, + "loss": 0.0071, + "step": 3321, + "total_loss": 0.006103515625 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001910348504879233, + "lm_loss": 0.0030517578125, + "loss": 0.008, + "step": 3322, + "total_loss": 0.0030517578125 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001910295344284, + "lm_loss": 0.007568359375, + "loss": 0.0076, + "step": 3323, + "total_loss": 0.007568359375 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019102421686722517, + "lm_loss": 0.0198974609375, + "loss": 0.0075, + "step": 3324, + "total_loss": 0.0198974609375 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019101889780448654, + "lm_loss": 0.003631591796875, + "loss": 0.0079, + "step": 3325, + "total_loss": 0.003631591796875 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019101357724027183, + "lm_loss": 0.005859375, + "loss": 0.0084, + "step": 3326, + "total_loss": 0.005859375 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019100825517466886, + "lm_loss": 0.007476806640625, + "loss": 0.0096, + "step": 3327, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019100293160776536, + "lm_loss": 0.005615234375, + "loss": 0.0078, + "step": 3328, + "total_loss": 0.005615234375 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019099760653964922, + "lm_loss": 0.006072998046875, + "loss": 0.007, + "step": 3329, + "total_loss": 0.006072998046875 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019099227997040826, + "lm_loss": 0.00872802734375, + "loss": 0.0085, + "step": 3330, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019098695190013028, + "lm_loss": 0.00970458984375, + "loss": 0.007, + "step": 3331, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001909816223289033, + "lm_loss": 0.01129150390625, + "loss": 0.0074, + "step": 3332, + "total_loss": 0.01129150390625 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019097629125681514, + "lm_loss": 0.00396728515625, + "loss": 0.0077, + "step": 3333, + "total_loss": 0.00396728515625 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019097095868395378, + "lm_loss": 0.01019287109375, + "loss": 0.0064, + "step": 3334, + "total_loss": 0.01019287109375 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019096562461040718, + "lm_loss": 0.01116943359375, + "loss": 0.0067, + "step": 3335, + "total_loss": 0.01116943359375 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001909602890362633, + "lm_loss": 0.0093994140625, + "loss": 0.0078, + "step": 3336, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019095495196161025, + "lm_loss": 0.0078125, + "loss": 0.0071, + "step": 3337, + "total_loss": 0.0078125 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019094961338653597, + "lm_loss": 0.012451171875, + "loss": 0.0082, + "step": 3338, + "total_loss": 0.012451171875 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019094427331112855, + "lm_loss": 0.00482177734375, + "loss": 0.0069, + "step": 3339, + "total_loss": 0.00482177734375 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019093893173547613, + "lm_loss": 0.010986328125, + "loss": 0.0082, + "step": 3340, + "total_loss": 0.010986328125 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001909335886596668, + "lm_loss": 0.00186920166015625, + "loss": 0.0079, + "step": 3341, + "total_loss": 0.00186920166015625 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019092824408378867, + "lm_loss": 0.0040283203125, + "loss": 0.0071, + "step": 3342, + "total_loss": 0.0040283203125 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019092289800793, + "lm_loss": 0.004119873046875, + "loss": 0.0063, + "step": 3343, + "total_loss": 0.004119873046875 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019091755043217887, + "lm_loss": 0.007049560546875, + "loss": 0.0063, + "step": 3344, + "total_loss": 0.007049560546875 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019091220135662355, + "lm_loss": 0.01007080078125, + "loss": 0.0097, + "step": 3345, + "total_loss": 0.01007080078125 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019090685078135223, + "lm_loss": 0.007781982421875, + "loss": 0.0096, + "step": 3346, + "total_loss": 0.007781982421875 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001909014987064533, + "lm_loss": 0.006561279296875, + "loss": 0.0068, + "step": 3347, + "total_loss": 0.006561279296875 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019089614513201488, + "lm_loss": 0.005462646484375, + "loss": 0.0072, + "step": 3348, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001908907900581254, + "lm_loss": 0.0096435546875, + "loss": 0.0061, + "step": 3349, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019088543348487317, + "lm_loss": 0.01092529296875, + "loss": 0.0082, + "step": 3350, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019088007541234655, + "lm_loss": 0.00592041015625, + "loss": 0.0073, + "step": 3351, + "total_loss": 0.00592041015625 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019087471584063392, + "lm_loss": 0.005218505859375, + "loss": 0.008, + "step": 3352, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001908693547698237, + "lm_loss": 0.0024566650390625, + "loss": 0.008, + "step": 3353, + "total_loss": 0.0024566650390625 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019086399220000436, + "lm_loss": 0.005950927734375, + "loss": 0.0075, + "step": 3354, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001908586281312643, + "lm_loss": 0.00592041015625, + "loss": 0.0068, + "step": 3355, + "total_loss": 0.00592041015625 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019085326256369206, + "lm_loss": 0.00775146484375, + "loss": 0.0065, + "step": 3356, + "total_loss": 0.00775146484375 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001908478954973761, + "lm_loss": 0.0045166015625, + "loss": 0.0079, + "step": 3357, + "total_loss": 0.0045166015625 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019084252693240506, + "lm_loss": 0.004852294921875, + "loss": 0.0083, + "step": 3358, + "total_loss": 0.004852294921875 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019083715686886736, + "lm_loss": 0.002838134765625, + "loss": 0.0079, + "step": 3359, + "total_loss": 0.002838134765625 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001908317853068517, + "lm_loss": 0.0103759765625, + "loss": 0.0074, + "step": 3360, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019082641224644662, + "lm_loss": 0.00921630859375, + "loss": 0.0059, + "step": 3361, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001908210376877408, + "lm_loss": 0.00897216796875, + "loss": 0.0064, + "step": 3362, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019081566163082285, + "lm_loss": 0.0115966796875, + "loss": 0.0081, + "step": 3363, + "total_loss": 0.0115966796875 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019081028407578153, + "lm_loss": 0.00799560546875, + "loss": 0.0071, + "step": 3364, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019080490502270552, + "lm_loss": 0.004364013671875, + "loss": 0.0072, + "step": 3365, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001907995244716835, + "lm_loss": 0.01019287109375, + "loss": 0.008, + "step": 3366, + "total_loss": 0.01019287109375 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019079414242280428, + "lm_loss": 0.01385498046875, + "loss": 0.0112, + "step": 3367, + "total_loss": 0.01385498046875 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019078875887615666, + "lm_loss": 0.0034637451171875, + "loss": 0.0078, + "step": 3368, + "total_loss": 0.0034637451171875 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019078337383182943, + "lm_loss": 0.0093994140625, + "loss": 0.0077, + "step": 3369, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019077798728991138, + "lm_loss": 0.00518798828125, + "loss": 0.0059, + "step": 3370, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019077259925049146, + "lm_loss": 0.00946044921875, + "loss": 0.0066, + "step": 3371, + "total_loss": 0.00946044921875 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019076720971365844, + "lm_loss": 0.0166015625, + "loss": 0.0062, + "step": 3372, + "total_loss": 0.0166015625 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019076181867950132, + "lm_loss": 0.006500244140625, + "loss": 0.0086, + "step": 3373, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.38, + "learning_rate": 0.000190756426148109, + "lm_loss": 0.002532958984375, + "loss": 0.0073, + "step": 3374, + "total_loss": 0.002532958984375 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019075103211957045, + "lm_loss": 0.01141357421875, + "loss": 0.0082, + "step": 3375, + "total_loss": 0.01141357421875 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001907456365939746, + "lm_loss": 0.01031494140625, + "loss": 0.0081, + "step": 3376, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019074023957141051, + "lm_loss": 0.00531005859375, + "loss": 0.0074, + "step": 3377, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001907348410519672, + "lm_loss": 0.009033203125, + "loss": 0.0069, + "step": 3378, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019072944103573373, + "lm_loss": 0.00823974609375, + "loss": 0.0074, + "step": 3379, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019072403952279915, + "lm_loss": 0.007080078125, + "loss": 0.0075, + "step": 3380, + "total_loss": 0.007080078125 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001907186365132526, + "lm_loss": 0.00714111328125, + "loss": 0.0076, + "step": 3381, + "total_loss": 0.00714111328125 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001907132320071832, + "lm_loss": 0.0048828125, + "loss": 0.0078, + "step": 3382, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001907078260046801, + "lm_loss": 0.01397705078125, + "loss": 0.0083, + "step": 3383, + "total_loss": 0.01397705078125 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019070241850583245, + "lm_loss": 0.002166748046875, + "loss": 0.0079, + "step": 3384, + "total_loss": 0.002166748046875 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001906970095107295, + "lm_loss": 0.006927490234375, + "loss": 0.0074, + "step": 3385, + "total_loss": 0.006927490234375 + }, + { + "epoch": 1.38, + "learning_rate": 0.00019069159901946047, + "lm_loss": 0.01092529296875, + "loss": 0.0073, + "step": 3386, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001906861870321146, + "lm_loss": 0.0115966796875, + "loss": 0.0084, + "step": 3387, + "total_loss": 0.0115966796875 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019068077354878117, + "lm_loss": 0.00640869140625, + "loss": 0.0061, + "step": 3388, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001906753585695495, + "lm_loss": 0.005523681640625, + "loss": 0.0092, + "step": 3389, + "total_loss": 0.005523681640625 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019066994209450888, + "lm_loss": 0.00927734375, + "loss": 0.0066, + "step": 3390, + "total_loss": 0.00927734375 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001906645241237487, + "lm_loss": 0.007110595703125, + "loss": 0.0084, + "step": 3391, + "total_loss": 0.007110595703125 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001906591046573583, + "lm_loss": 0.005126953125, + "loss": 0.0088, + "step": 3392, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019065368369542714, + "lm_loss": 0.01495361328125, + "loss": 0.0082, + "step": 3393, + "total_loss": 0.01495361328125 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019064826123804456, + "lm_loss": 0.004302978515625, + "loss": 0.0085, + "step": 3394, + "total_loss": 0.004302978515625 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019064283728530007, + "lm_loss": 0.004608154296875, + "loss": 0.0087, + "step": 3395, + "total_loss": 0.004608154296875 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019063741183728317, + "lm_loss": 0.005462646484375, + "loss": 0.0074, + "step": 3396, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019063198489408328, + "lm_loss": 0.0032806396484375, + "loss": 0.008, + "step": 3397, + "total_loss": 0.0032806396484375 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019062655645579, + "lm_loss": 0.006988525390625, + "loss": 0.0069, + "step": 3398, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019062112652249282, + "lm_loss": 0.00958251953125, + "loss": 0.006, + "step": 3399, + "total_loss": 0.00958251953125 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019061569509428136, + "lm_loss": 0.013427734375, + "loss": 0.0068, + "step": 3400, + "total_loss": 0.013427734375 + }, + { + "epoch": 1.39, + "eval_lm_loss": 0.009199698455631733, + "eval_loss": 0.009643125347793102, + "eval_runtime": 43.9104, + "eval_samples_per_second": 22.774, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009199698455631733, + "lm_loss": 0.00109100341796875, + "step": 3400, + "total_loss": 0.00109100341796875 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001906102621712452, + "lm_loss": 0.01165771484375, + "loss": 0.0086, + "step": 3401, + "total_loss": 0.01165771484375 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019060482775347395, + "lm_loss": 0.0111083984375, + "loss": 0.007, + "step": 3402, + "total_loss": 0.0111083984375 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019059939184105725, + "lm_loss": 0.01287841796875, + "loss": 0.008, + "step": 3403, + "total_loss": 0.01287841796875 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019059395443408482, + "lm_loss": 0.012451171875, + "loss": 0.0071, + "step": 3404, + "total_loss": 0.012451171875 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019058851553264633, + "lm_loss": 0.006317138671875, + "loss": 0.0077, + "step": 3405, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019058307513683152, + "lm_loss": 0.01416015625, + "loss": 0.0099, + "step": 3406, + "total_loss": 0.01416015625 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019057763324673008, + "lm_loss": 0.004119873046875, + "loss": 0.0071, + "step": 3407, + "total_loss": 0.004119873046875 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019057218986243184, + "lm_loss": 0.0086669921875, + "loss": 0.0086, + "step": 3408, + "total_loss": 0.0086669921875 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019056674498402657, + "lm_loss": 0.0045166015625, + "loss": 0.0074, + "step": 3409, + "total_loss": 0.0045166015625 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001905612986116041, + "lm_loss": 0.00726318359375, + "loss": 0.0059, + "step": 3410, + "total_loss": 0.00726318359375 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001905558507452543, + "lm_loss": 0.0081787109375, + "loss": 0.0072, + "step": 3411, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019055040138506697, + "lm_loss": 0.012451171875, + "loss": 0.0082, + "step": 3412, + "total_loss": 0.012451171875 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019054495053113203, + "lm_loss": 0.0076904296875, + "loss": 0.0084, + "step": 3413, + "total_loss": 0.0076904296875 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019053949818353944, + "lm_loss": 0.00537109375, + "loss": 0.0063, + "step": 3414, + "total_loss": 0.00537109375 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019053404434237915, + "lm_loss": 0.01556396484375, + "loss": 0.0085, + "step": 3415, + "total_loss": 0.01556396484375 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019052858900774102, + "lm_loss": 0.006744384765625, + "loss": 0.0079, + "step": 3416, + "total_loss": 0.006744384765625 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019052313217971518, + "lm_loss": 0.01287841796875, + "loss": 0.0069, + "step": 3417, + "total_loss": 0.01287841796875 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019051767385839155, + "lm_loss": 0.005859375, + "loss": 0.0079, + "step": 3418, + "total_loss": 0.005859375 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001905122140438602, + "lm_loss": 0.01409912109375, + "loss": 0.0089, + "step": 3419, + "total_loss": 0.01409912109375 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019050675273621126, + "lm_loss": 0.01031494140625, + "loss": 0.006, + "step": 3420, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019050128993553474, + "lm_loss": 0.006500244140625, + "loss": 0.0082, + "step": 3421, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019049582564192077, + "lm_loss": 0.00640869140625, + "loss": 0.007, + "step": 3422, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001904903598554595, + "lm_loss": 0.012451171875, + "loss": 0.0061, + "step": 3423, + "total_loss": 0.012451171875 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019048489257624112, + "lm_loss": 0.01361083984375, + "loss": 0.0097, + "step": 3424, + "total_loss": 0.01361083984375 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019047942380435578, + "lm_loss": 0.011962890625, + "loss": 0.008, + "step": 3425, + "total_loss": 0.011962890625 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019047395353989372, + "lm_loss": 0.013427734375, + "loss": 0.0074, + "step": 3426, + "total_loss": 0.013427734375 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001904684817829452, + "lm_loss": 0.00274658203125, + "loss": 0.0074, + "step": 3427, + "total_loss": 0.00274658203125 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001904630085336004, + "lm_loss": 0.01104736328125, + "loss": 0.0096, + "step": 3428, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019045753379194968, + "lm_loss": 0.007659912109375, + "loss": 0.0077, + "step": 3429, + "total_loss": 0.007659912109375 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019045205755808335, + "lm_loss": 0.0078125, + "loss": 0.0079, + "step": 3430, + "total_loss": 0.0078125 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019044657983209175, + "lm_loss": 0.0084228515625, + "loss": 0.0071, + "step": 3431, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019044110061406522, + "lm_loss": 0.008544921875, + "loss": 0.0089, + "step": 3432, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019043561990409412, + "lm_loss": 0.00482177734375, + "loss": 0.008, + "step": 3433, + "total_loss": 0.00482177734375 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019043013770226894, + "lm_loss": 0.01043701171875, + "loss": 0.0089, + "step": 3434, + "total_loss": 0.01043701171875 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019042465400868005, + "lm_loss": 0.014404296875, + "loss": 0.0081, + "step": 3435, + "total_loss": 0.014404296875 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019041916882341794, + "lm_loss": 0.005462646484375, + "loss": 0.0089, + "step": 3436, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001904136821465731, + "lm_loss": 0.0113525390625, + "loss": 0.0082, + "step": 3437, + "total_loss": 0.0113525390625 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019040819397823596, + "lm_loss": 0.0074462890625, + "loss": 0.0089, + "step": 3438, + "total_loss": 0.0074462890625 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001904027043184972, + "lm_loss": 0.00531005859375, + "loss": 0.0071, + "step": 3439, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019039721316744726, + "lm_loss": 0.0028228759765625, + "loss": 0.0068, + "step": 3440, + "total_loss": 0.0028228759765625 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001903917205251768, + "lm_loss": 0.007720947265625, + "loss": 0.0088, + "step": 3441, + "total_loss": 0.007720947265625 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019038622639177635, + "lm_loss": 0.0062255859375, + "loss": 0.0069, + "step": 3442, + "total_loss": 0.0062255859375 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019038073076733665, + "lm_loss": 0.005828857421875, + "loss": 0.0081, + "step": 3443, + "total_loss": 0.005828857421875 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019037523365194825, + "lm_loss": 0.00714111328125, + "loss": 0.0085, + "step": 3444, + "total_loss": 0.00714111328125 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019036973504570187, + "lm_loss": 0.0093994140625, + "loss": 0.0067, + "step": 3445, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001903642349486882, + "lm_loss": 0.00531005859375, + "loss": 0.0063, + "step": 3446, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019035873336099805, + "lm_loss": 0.01129150390625, + "loss": 0.0084, + "step": 3447, + "total_loss": 0.01129150390625 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001903532302827221, + "lm_loss": 0.005462646484375, + "loss": 0.0064, + "step": 3448, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019034772571395117, + "lm_loss": 0.00531005859375, + "loss": 0.0077, + "step": 3449, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.41, + "learning_rate": 0.000190342219654776, + "lm_loss": 0.00482177734375, + "loss": 0.0062, + "step": 3450, + "total_loss": 0.00482177734375 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019033671210528749, + "lm_loss": 0.00640869140625, + "loss": 0.0071, + "step": 3451, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019033120306557647, + "lm_loss": 0.00811767578125, + "loss": 0.0075, + "step": 3452, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019032569253573382, + "lm_loss": 0.004058837890625, + "loss": 0.0079, + "step": 3453, + "total_loss": 0.004058837890625 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019032018051585043, + "lm_loss": 0.01153564453125, + "loss": 0.0078, + "step": 3454, + "total_loss": 0.01153564453125 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019031466700601725, + "lm_loss": 0.00933837890625, + "loss": 0.0075, + "step": 3455, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019030915200632522, + "lm_loss": 0.0054931640625, + "loss": 0.0068, + "step": 3456, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019030363551686533, + "lm_loss": 0.00506591796875, + "loss": 0.0064, + "step": 3457, + "total_loss": 0.00506591796875 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019029811753772856, + "lm_loss": 0.01031494140625, + "loss": 0.0079, + "step": 3458, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019029259806900594, + "lm_loss": 0.005584716796875, + "loss": 0.0073, + "step": 3459, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.41, + "learning_rate": 0.00019028707711078854, + "lm_loss": 0.007110595703125, + "loss": 0.0083, + "step": 3460, + "total_loss": 0.007110595703125 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001902815546631674, + "lm_loss": 0.002899169921875, + "loss": 0.0075, + "step": 3461, + "total_loss": 0.002899169921875 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019027603072623367, + "lm_loss": 0.004547119140625, + "loss": 0.007, + "step": 3462, + "total_loss": 0.004547119140625 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019027050530007845, + "lm_loss": 0.0020294189453125, + "loss": 0.0068, + "step": 3463, + "total_loss": 0.0020294189453125 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019026497838479288, + "lm_loss": 0.01312255859375, + "loss": 0.0072, + "step": 3464, + "total_loss": 0.01312255859375 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019025944998046814, + "lm_loss": 0.0087890625, + "loss": 0.0076, + "step": 3465, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019025392008719543, + "lm_loss": 0.0023040771484375, + "loss": 0.0058, + "step": 3466, + "total_loss": 0.0023040771484375 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019024838870506598, + "lm_loss": 0.0098876953125, + "loss": 0.0077, + "step": 3467, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019024285583417103, + "lm_loss": 0.005889892578125, + "loss": 0.0081, + "step": 3468, + "total_loss": 0.005889892578125 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019023732147460185, + "lm_loss": 0.00433349609375, + "loss": 0.0068, + "step": 3469, + "total_loss": 0.00433349609375 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019023178562644973, + "lm_loss": 0.0111083984375, + "loss": 0.0109, + "step": 3470, + "total_loss": 0.0111083984375 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019022624828980602, + "lm_loss": 0.006683349609375, + "loss": 0.0077, + "step": 3471, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019022070946476204, + "lm_loss": 0.0026397705078125, + "loss": 0.007, + "step": 3472, + "total_loss": 0.0026397705078125 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019021516915140914, + "lm_loss": 0.007659912109375, + "loss": 0.0061, + "step": 3473, + "total_loss": 0.007659912109375 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019020962734983874, + "lm_loss": 0.00933837890625, + "loss": 0.0069, + "step": 3474, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019020408406014228, + "lm_loss": 0.0072021484375, + "loss": 0.0061, + "step": 3475, + "total_loss": 0.0072021484375 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001901985392824112, + "lm_loss": 0.00872802734375, + "loss": 0.0071, + "step": 3476, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001901929930167369, + "lm_loss": 0.0107421875, + "loss": 0.008, + "step": 3477, + "total_loss": 0.0107421875 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019018744526321096, + "lm_loss": 0.00897216796875, + "loss": 0.0069, + "step": 3478, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019018189602192486, + "lm_loss": 0.005950927734375, + "loss": 0.0082, + "step": 3479, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019017634529297012, + "lm_loss": 0.006378173828125, + "loss": 0.0077, + "step": 3480, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019017079307643835, + "lm_loss": 0.0101318359375, + "loss": 0.0068, + "step": 3481, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.42, + "learning_rate": 0.0001901652393724211, + "lm_loss": 0.0079345703125, + "loss": 0.0055, + "step": 3482, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019015968418101004, + "lm_loss": 0.005859375, + "loss": 0.0079, + "step": 3483, + "total_loss": 0.005859375 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019015412750229675, + "lm_loss": 0.00872802734375, + "loss": 0.0093, + "step": 3484, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.42, + "learning_rate": 0.00019014856933637295, + "lm_loss": 0.006500244140625, + "loss": 0.0088, + "step": 3485, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019014300968333027, + "lm_loss": 0.00341796875, + "loss": 0.0081, + "step": 3486, + "total_loss": 0.00341796875 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001901374485432605, + "lm_loss": 0.00897216796875, + "loss": 0.0082, + "step": 3487, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001901318859162553, + "lm_loss": 0.007720947265625, + "loss": 0.0087, + "step": 3488, + "total_loss": 0.007720947265625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019012632180240646, + "lm_loss": 0.00445556640625, + "loss": 0.0078, + "step": 3489, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001901207562018058, + "lm_loss": 0.00823974609375, + "loss": 0.0071, + "step": 3490, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001901151891145451, + "lm_loss": 0.0029449462890625, + "loss": 0.0067, + "step": 3491, + "total_loss": 0.0029449462890625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019010962054071616, + "lm_loss": 0.00848388671875, + "loss": 0.0063, + "step": 3492, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019010405048041094, + "lm_loss": 0.00811767578125, + "loss": 0.0065, + "step": 3493, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019009847893372122, + "lm_loss": 0.004486083984375, + "loss": 0.0064, + "step": 3494, + "total_loss": 0.004486083984375 + }, + { + "epoch": 1.43, + "learning_rate": 0.000190092905900739, + "lm_loss": 0.008544921875, + "loss": 0.0089, + "step": 3495, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019008733138155615, + "lm_loss": 0.00567626953125, + "loss": 0.0064, + "step": 3496, + "total_loss": 0.00567626953125 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019008175537626464, + "lm_loss": 0.0024566650390625, + "loss": 0.0079, + "step": 3497, + "total_loss": 0.0024566650390625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019007617788495645, + "lm_loss": 0.003204345703125, + "loss": 0.0063, + "step": 3498, + "total_loss": 0.003204345703125 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019007059890772365, + "lm_loss": 0.00439453125, + "loss": 0.008, + "step": 3499, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019006501844465824, + "lm_loss": 0.0125732421875, + "loss": 0.0088, + "step": 3500, + "total_loss": 0.0125732421875 + }, + { + "epoch": 1.43, + "eval_lm_loss": 0.009533802978694439, + "eval_loss": 0.009850949980318546, + "eval_runtime": 44.076, + "eval_samples_per_second": 22.688, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.009533802978694439, + "lm_loss": 0.00109100341796875, + "step": 3500, + "total_loss": 0.00109100341796875 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001900594364958522, + "lm_loss": 0.0089111328125, + "loss": 0.0067, + "step": 3501, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001900538530613977, + "lm_loss": 0.007476806640625, + "loss": 0.0069, + "step": 3502, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019004826814138684, + "lm_loss": 0.005615234375, + "loss": 0.0071, + "step": 3503, + "total_loss": 0.005615234375 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019004268173591172, + "lm_loss": 0.0042724609375, + "loss": 0.0067, + "step": 3504, + "total_loss": 0.0042724609375 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001900370938450645, + "lm_loss": 0.01422119140625, + "loss": 0.0092, + "step": 3505, + "total_loss": 0.01422119140625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019003150446893736, + "lm_loss": 0.006805419921875, + "loss": 0.0081, + "step": 3506, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019002591360762252, + "lm_loss": 0.01043701171875, + "loss": 0.0095, + "step": 3507, + "total_loss": 0.01043701171875 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019002032126121219, + "lm_loss": 0.0025177001953125, + "loss": 0.0077, + "step": 3508, + "total_loss": 0.0025177001953125 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019001472742979862, + "lm_loss": 0.0084228515625, + "loss": 0.0072, + "step": 3509, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019000913211347412, + "lm_loss": 0.0079345703125, + "loss": 0.0083, + "step": 3510, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.44, + "learning_rate": 0.00019000353531233098, + "lm_loss": 0.009765625, + "loss": 0.0089, + "step": 3511, + "total_loss": 0.009765625 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001899979370264615, + "lm_loss": 0.0040283203125, + "loss": 0.0077, + "step": 3512, + "total_loss": 0.0040283203125 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018999233725595804, + "lm_loss": 0.0079345703125, + "loss": 0.0077, + "step": 3513, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.44, + "learning_rate": 0.000189986736000913, + "lm_loss": 0.005035400390625, + "loss": 0.0081, + "step": 3514, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018998113326141878, + "lm_loss": 0.00445556640625, + "loss": 0.008, + "step": 3515, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018997552903756776, + "lm_loss": 0.0107421875, + "loss": 0.0069, + "step": 3516, + "total_loss": 0.0107421875 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018996992332945243, + "lm_loss": 0.0205078125, + "loss": 0.0099, + "step": 3517, + "total_loss": 0.0205078125 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018996431613716526, + "lm_loss": 0.01239013671875, + "loss": 0.0075, + "step": 3518, + "total_loss": 0.01239013671875 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001899587074607987, + "lm_loss": 0.0106201171875, + "loss": 0.0066, + "step": 3519, + "total_loss": 0.0106201171875 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018995309730044535, + "lm_loss": 0.007476806640625, + "loss": 0.0075, + "step": 3520, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001899474856561977, + "lm_loss": 0.005767822265625, + "loss": 0.007, + "step": 3521, + "total_loss": 0.005767822265625 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018994187252814837, + "lm_loss": 0.005645751953125, + "loss": 0.0052, + "step": 3522, + "total_loss": 0.005645751953125 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001899362579163899, + "lm_loss": 0.0181884765625, + "loss": 0.0077, + "step": 3523, + "total_loss": 0.0181884765625 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018993064182101495, + "lm_loss": 0.01611328125, + "loss": 0.0077, + "step": 3524, + "total_loss": 0.01611328125 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018992502424211612, + "lm_loss": 0.004058837890625, + "loss": 0.0056, + "step": 3525, + "total_loss": 0.004058837890625 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018991940517978613, + "lm_loss": 0.005157470703125, + "loss": 0.0065, + "step": 3526, + "total_loss": 0.005157470703125 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018991378463411768, + "lm_loss": 0.003753662109375, + "loss": 0.0067, + "step": 3527, + "total_loss": 0.003753662109375 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018990816260520344, + "lm_loss": 0.00616455078125, + "loss": 0.0078, + "step": 3528, + "total_loss": 0.00616455078125 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018990253909313616, + "lm_loss": 0.00811767578125, + "loss": 0.0075, + "step": 3529, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018989691409800866, + "lm_loss": 0.012451171875, + "loss": 0.0095, + "step": 3530, + "total_loss": 0.012451171875 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018989128761991367, + "lm_loss": 0.00799560546875, + "loss": 0.0082, + "step": 3531, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018988565965894402, + "lm_loss": 0.00897216796875, + "loss": 0.0076, + "step": 3532, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001898800302151926, + "lm_loss": 0.0037384033203125, + "loss": 0.0066, + "step": 3533, + "total_loss": 0.0037384033203125 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001898743992887522, + "lm_loss": 0.00323486328125, + "loss": 0.0076, + "step": 3534, + "total_loss": 0.00323486328125 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018986876687971576, + "lm_loss": 0.0068359375, + "loss": 0.0067, + "step": 3535, + "total_loss": 0.0068359375 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018986313298817616, + "lm_loss": 0.01080322265625, + "loss": 0.0093, + "step": 3536, + "total_loss": 0.01080322265625 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018985749761422636, + "lm_loss": 0.005615234375, + "loss": 0.0086, + "step": 3537, + "total_loss": 0.005615234375 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018985186075795934, + "lm_loss": 0.01611328125, + "loss": 0.007, + "step": 3538, + "total_loss": 0.01611328125 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018984622241946807, + "lm_loss": 0.010498046875, + "loss": 0.0066, + "step": 3539, + "total_loss": 0.010498046875 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018984058259884553, + "lm_loss": 0.00250244140625, + "loss": 0.0078, + "step": 3540, + "total_loss": 0.00250244140625 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018983494129618485, + "lm_loss": 0.003936767578125, + "loss": 0.0083, + "step": 3541, + "total_loss": 0.003936767578125 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018982929851157896, + "lm_loss": 0.00927734375, + "loss": 0.0089, + "step": 3542, + "total_loss": 0.00927734375 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018982365424512104, + "lm_loss": 0.00604248046875, + "loss": 0.0072, + "step": 3543, + "total_loss": 0.00604248046875 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018981800849690415, + "lm_loss": 0.005950927734375, + "loss": 0.0077, + "step": 3544, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018981236126702148, + "lm_loss": 0.0034332275390625, + "loss": 0.0065, + "step": 3545, + "total_loss": 0.0034332275390625 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018980671255556614, + "lm_loss": 0.004058837890625, + "loss": 0.0085, + "step": 3546, + "total_loss": 0.004058837890625 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018980106236263133, + "lm_loss": 0.0093994140625, + "loss": 0.0103, + "step": 3547, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018979541068831024, + "lm_loss": 0.007537841796875, + "loss": 0.0081, + "step": 3548, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018978975753269613, + "lm_loss": 0.0096435546875, + "loss": 0.0065, + "step": 3549, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018978410289588224, + "lm_loss": 0.0034637451171875, + "loss": 0.0076, + "step": 3550, + "total_loss": 0.0034637451171875 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018977844677796186, + "lm_loss": 0.00701904296875, + "loss": 0.0069, + "step": 3551, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018977278917902828, + "lm_loss": 0.00909423828125, + "loss": 0.007, + "step": 3552, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018976713009917484, + "lm_loss": 0.00830078125, + "loss": 0.0085, + "step": 3553, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018976146953849488, + "lm_loss": 0.0137939453125, + "loss": 0.0085, + "step": 3554, + "total_loss": 0.0137939453125 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001897558074970818, + "lm_loss": 0.016845703125, + "loss": 0.0084, + "step": 3555, + "total_loss": 0.016845703125 + }, + { + "epoch": 1.45, + "learning_rate": 0.000189750143975029, + "lm_loss": 0.00933837890625, + "loss": 0.0073, + "step": 3556, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018974447897242988, + "lm_loss": 0.007659912109375, + "loss": 0.0078, + "step": 3557, + "total_loss": 0.007659912109375 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018973881248937792, + "lm_loss": 0.0052490234375, + "loss": 0.0064, + "step": 3558, + "total_loss": 0.0052490234375 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001897331445259666, + "lm_loss": 0.005462646484375, + "loss": 0.008, + "step": 3559, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018972747508228942, + "lm_loss": 0.00958251953125, + "loss": 0.0086, + "step": 3560, + "total_loss": 0.00958251953125 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018972180415843982, + "lm_loss": 0.007293701171875, + "loss": 0.0073, + "step": 3561, + "total_loss": 0.007293701171875 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001897161317545115, + "lm_loss": 0.0081787109375, + "loss": 0.0071, + "step": 3562, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018971045787059793, + "lm_loss": 0.0096435546875, + "loss": 0.0073, + "step": 3563, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018970478250679275, + "lm_loss": 0.005615234375, + "loss": 0.0075, + "step": 3564, + "total_loss": 0.005615234375 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018969910566318953, + "lm_loss": 0.0068359375, + "loss": 0.0083, + "step": 3565, + "total_loss": 0.0068359375 + }, + { + "epoch": 1.46, + "learning_rate": 0.000189693427339882, + "lm_loss": 0.00555419921875, + "loss": 0.007, + "step": 3566, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018968774753696374, + "lm_loss": 0.003570556640625, + "loss": 0.0069, + "step": 3567, + "total_loss": 0.003570556640625 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018968206625452852, + "lm_loss": 0.003997802734375, + "loss": 0.0064, + "step": 3568, + "total_loss": 0.003997802734375 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018967638349267, + "lm_loss": 0.00567626953125, + "loss": 0.0072, + "step": 3569, + "total_loss": 0.00567626953125 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018967069925148199, + "lm_loss": 0.006378173828125, + "loss": 0.0071, + "step": 3570, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001896650135310582, + "lm_loss": 0.01348876953125, + "loss": 0.0075, + "step": 3571, + "total_loss": 0.01348876953125 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001896593263314925, + "lm_loss": 0.0037689208984375, + "loss": 0.0063, + "step": 3572, + "total_loss": 0.0037689208984375 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018965363765287857, + "lm_loss": 0.007080078125, + "loss": 0.007, + "step": 3573, + "total_loss": 0.007080078125 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001896479474953104, + "lm_loss": 0.006195068359375, + "loss": 0.0069, + "step": 3574, + "total_loss": 0.006195068359375 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018964225585888178, + "lm_loss": 0.00775146484375, + "loss": 0.0075, + "step": 3575, + "total_loss": 0.00775146484375 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001896365627436866, + "lm_loss": 0.0130615234375, + "loss": 0.0091, + "step": 3576, + "total_loss": 0.0130615234375 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018963086814981884, + "lm_loss": 0.00102996826171875, + "loss": 0.0069, + "step": 3577, + "total_loss": 0.00102996826171875 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018962517207737233, + "lm_loss": 0.010498046875, + "loss": 0.0064, + "step": 3578, + "total_loss": 0.010498046875 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018961947452644115, + "lm_loss": 0.01104736328125, + "loss": 0.0078, + "step": 3579, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018961377549711917, + "lm_loss": 0.006927490234375, + "loss": 0.0075, + "step": 3580, + "total_loss": 0.006927490234375 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018960807498950052, + "lm_loss": 0.0036468505859375, + "loss": 0.0071, + "step": 3581, + "total_loss": 0.0036468505859375 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001896023730036792, + "lm_loss": 0.006622314453125, + "loss": 0.0079, + "step": 3582, + "total_loss": 0.006622314453125 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001895966695397492, + "lm_loss": 0.0079345703125, + "loss": 0.0083, + "step": 3583, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018959096459780469, + "lm_loss": 0.00421142578125, + "loss": 0.0056, + "step": 3584, + "total_loss": 0.00421142578125 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018958525817793976, + "lm_loss": 0.0036163330078125, + "loss": 0.0088, + "step": 3585, + "total_loss": 0.0036163330078125 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018957955028024853, + "lm_loss": 0.0079345703125, + "loss": 0.0059, + "step": 3586, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018957384090482514, + "lm_loss": 0.00193023681640625, + "loss": 0.0077, + "step": 3587, + "total_loss": 0.00193023681640625 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001895681300517638, + "lm_loss": 0.003509521484375, + "loss": 0.0074, + "step": 3588, + "total_loss": 0.003509521484375 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018956241772115874, + "lm_loss": 0.00799560546875, + "loss": 0.0085, + "step": 3589, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001895567039131042, + "lm_loss": 0.00830078125, + "loss": 0.0078, + "step": 3590, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018955098862769436, + "lm_loss": 0.0072021484375, + "loss": 0.0084, + "step": 3591, + "total_loss": 0.0072021484375 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018954527186502358, + "lm_loss": 0.004180908203125, + "loss": 0.0069, + "step": 3592, + "total_loss": 0.004180908203125 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018953955362518613, + "lm_loss": 0.00421142578125, + "loss": 0.0073, + "step": 3593, + "total_loss": 0.00421142578125 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018953383390827634, + "lm_loss": 0.007476806640625, + "loss": 0.0089, + "step": 3594, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018952811271438856, + "lm_loss": 0.006744384765625, + "loss": 0.0065, + "step": 3595, + "total_loss": 0.006744384765625 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018952239004361716, + "lm_loss": 0.0025482177734375, + "loss": 0.0059, + "step": 3596, + "total_loss": 0.0025482177734375 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001895166658960566, + "lm_loss": 0.00958251953125, + "loss": 0.0068, + "step": 3597, + "total_loss": 0.00958251953125 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018951094027180122, + "lm_loss": 0.00933837890625, + "loss": 0.0079, + "step": 3598, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018950521317094557, + "lm_loss": 0.004547119140625, + "loss": 0.0061, + "step": 3599, + "total_loss": 0.004547119140625 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018949948459358402, + "lm_loss": 0.01422119140625, + "loss": 0.0097, + "step": 3600, + "total_loss": 0.01422119140625 + }, + { + "epoch": 1.47, + "eval_lm_loss": 0.009612790308892727, + "eval_loss": 0.010066261515021324, + "eval_runtime": 43.9934, + "eval_samples_per_second": 22.731, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009612790308892727, + "lm_loss": 0.0012054443359375, + "step": 3600, + "total_loss": 0.0012054443359375 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018949375453981116, + "lm_loss": 0.003814697265625, + "loss": 0.0072, + "step": 3601, + "total_loss": 0.003814697265625 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001894880230097215, + "lm_loss": 0.0050048828125, + "loss": 0.0074, + "step": 3602, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018948229000340954, + "lm_loss": 0.006561279296875, + "loss": 0.0048, + "step": 3603, + "total_loss": 0.006561279296875 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018947655552096988, + "lm_loss": 0.0032958984375, + "loss": 0.0088, + "step": 3604, + "total_loss": 0.0032958984375 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001894708195624971, + "lm_loss": 0.0101318359375, + "loss": 0.0088, + "step": 3605, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001894650821280859, + "lm_loss": 0.006072998046875, + "loss": 0.0055, + "step": 3606, + "total_loss": 0.006072998046875 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018945934321783084, + "lm_loss": 0.01214599609375, + "loss": 0.0084, + "step": 3607, + "total_loss": 0.01214599609375 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001894536028318266, + "lm_loss": 0.00665283203125, + "loss": 0.0067, + "step": 3608, + "total_loss": 0.00665283203125 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001894478609701679, + "lm_loss": 0.00555419921875, + "loss": 0.0073, + "step": 3609, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018944211763294948, + "lm_loss": 0.004852294921875, + "loss": 0.0091, + "step": 3610, + "total_loss": 0.004852294921875 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018943637282026603, + "lm_loss": 0.01416015625, + "loss": 0.0077, + "step": 3611, + "total_loss": 0.01416015625 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018943062653221238, + "lm_loss": 0.00579833984375, + "loss": 0.0089, + "step": 3612, + "total_loss": 0.00579833984375 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018942487876888326, + "lm_loss": 0.01104736328125, + "loss": 0.0075, + "step": 3613, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001894191295303735, + "lm_loss": 0.01104736328125, + "loss": 0.0073, + "step": 3614, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.48, + "learning_rate": 0.000189413378816778, + "lm_loss": 0.006317138671875, + "loss": 0.0093, + "step": 3615, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018940762662819158, + "lm_loss": 0.008544921875, + "loss": 0.0084, + "step": 3616, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018940187296470913, + "lm_loss": 0.008056640625, + "loss": 0.0089, + "step": 3617, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018939611782642555, + "lm_loss": 0.0120849609375, + "loss": 0.0095, + "step": 3618, + "total_loss": 0.0120849609375 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001893903612134358, + "lm_loss": 0.00738525390625, + "loss": 0.0079, + "step": 3619, + "total_loss": 0.00738525390625 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018938460312583483, + "lm_loss": 0.004119873046875, + "loss": 0.0071, + "step": 3620, + "total_loss": 0.004119873046875 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018937884356371763, + "lm_loss": 0.01043701171875, + "loss": 0.0078, + "step": 3621, + "total_loss": 0.01043701171875 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018937308252717924, + "lm_loss": 0.0028076171875, + "loss": 0.0076, + "step": 3622, + "total_loss": 0.0028076171875 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018936732001631468, + "lm_loss": 0.006011962890625, + "loss": 0.0057, + "step": 3623, + "total_loss": 0.006011962890625 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018936155603121896, + "lm_loss": 0.005523681640625, + "loss": 0.0077, + "step": 3624, + "total_loss": 0.005523681640625 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018935579057198723, + "lm_loss": 0.005218505859375, + "loss": 0.0068, + "step": 3625, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001893500236387146, + "lm_loss": 0.002349853515625, + "loss": 0.0065, + "step": 3626, + "total_loss": 0.002349853515625 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018934425523149616, + "lm_loss": 0.0089111328125, + "loss": 0.0065, + "step": 3627, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018933848535042708, + "lm_loss": 0.009765625, + "loss": 0.0069, + "step": 3628, + "total_loss": 0.009765625 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018933271399560256, + "lm_loss": 0.0098876953125, + "loss": 0.0091, + "step": 3629, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001893269411671178, + "lm_loss": 0.0093994140625, + "loss": 0.0075, + "step": 3630, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.48, + "learning_rate": 0.000189321166865068, + "lm_loss": 0.0093994140625, + "loss": 0.0073, + "step": 3631, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018931539108954845, + "lm_loss": 0.006561279296875, + "loss": 0.0068, + "step": 3632, + "total_loss": 0.006561279296875 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018930961384065444, + "lm_loss": 0.007537841796875, + "loss": 0.0088, + "step": 3633, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018930383511848122, + "lm_loss": 0.00188446044921875, + "loss": 0.0081, + "step": 3634, + "total_loss": 0.00188446044921875 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018929805492312417, + "lm_loss": 0.01318359375, + "loss": 0.0084, + "step": 3635, + "total_loss": 0.01318359375 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018929227325467863, + "lm_loss": 0.006072998046875, + "loss": 0.0081, + "step": 3636, + "total_loss": 0.006072998046875 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018928649011323994, + "lm_loss": 0.00811767578125, + "loss": 0.0075, + "step": 3637, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018928070549890355, + "lm_loss": 0.0016632080078125, + "loss": 0.0069, + "step": 3638, + "total_loss": 0.0016632080078125 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018927491941176484, + "lm_loss": 0.006256103515625, + "loss": 0.0093, + "step": 3639, + "total_loss": 0.006256103515625 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018926913185191932, + "lm_loss": 0.00714111328125, + "loss": 0.0087, + "step": 3640, + "total_loss": 0.00714111328125 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018926334281946241, + "lm_loss": 0.0029144287109375, + "loss": 0.0056, + "step": 3641, + "total_loss": 0.0029144287109375 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001892575523144896, + "lm_loss": 0.006683349609375, + "loss": 0.0077, + "step": 3642, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018925176033709646, + "lm_loss": 0.0047607421875, + "loss": 0.0068, + "step": 3643, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001892459668873785, + "lm_loss": 0.00799560546875, + "loss": 0.0073, + "step": 3644, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001892401719654313, + "lm_loss": 0.00897216796875, + "loss": 0.0063, + "step": 3645, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018923437557135045, + "lm_loss": 0.00640869140625, + "loss": 0.0066, + "step": 3646, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001892285777052316, + "lm_loss": 0.005950927734375, + "loss": 0.0078, + "step": 3647, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018922277836717037, + "lm_loss": 0.005401611328125, + "loss": 0.0078, + "step": 3648, + "total_loss": 0.005401611328125 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018921697755726243, + "lm_loss": 0.0081787109375, + "loss": 0.0076, + "step": 3649, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018921117527560344, + "lm_loss": 0.00616455078125, + "loss": 0.008, + "step": 3650, + "total_loss": 0.00616455078125 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018920537152228916, + "lm_loss": 0.00830078125, + "loss": 0.0078, + "step": 3651, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018919956629741532, + "lm_loss": 0.0029449462890625, + "loss": 0.008, + "step": 3652, + "total_loss": 0.0029449462890625 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001891937596010777, + "lm_loss": 0.004669189453125, + "loss": 0.0055, + "step": 3653, + "total_loss": 0.004669189453125 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018918795143337203, + "lm_loss": 0.006591796875, + "loss": 0.0077, + "step": 3654, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018918214179439418, + "lm_loss": 0.007171630859375, + "loss": 0.0066, + "step": 3655, + "total_loss": 0.007171630859375 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018917633068423997, + "lm_loss": 0.0052490234375, + "loss": 0.0089, + "step": 3656, + "total_loss": 0.0052490234375 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018917051810300525, + "lm_loss": 0.021484375, + "loss": 0.0074, + "step": 3657, + "total_loss": 0.021484375 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018916470405078593, + "lm_loss": 0.01031494140625, + "loss": 0.0078, + "step": 3658, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001891588885276779, + "lm_loss": 0.00726318359375, + "loss": 0.0079, + "step": 3659, + "total_loss": 0.00726318359375 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018915307153377712, + "lm_loss": 0.00653076171875, + "loss": 0.0071, + "step": 3660, + "total_loss": 0.00653076171875 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018914725306917954, + "lm_loss": 0.01318359375, + "loss": 0.0092, + "step": 3661, + "total_loss": 0.01318359375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001891414331339811, + "lm_loss": 0.0023956298828125, + "loss": 0.0069, + "step": 3662, + "total_loss": 0.0023956298828125 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018913561172827787, + "lm_loss": 0.020751953125, + "loss": 0.0085, + "step": 3663, + "total_loss": 0.020751953125 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018912978885216587, + "lm_loss": 0.005523681640625, + "loss": 0.0068, + "step": 3664, + "total_loss": 0.005523681640625 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018912396450574111, + "lm_loss": 0.00567626953125, + "loss": 0.0058, + "step": 3665, + "total_loss": 0.00567626953125 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001891181386890997, + "lm_loss": 0.0111083984375, + "loss": 0.0076, + "step": 3666, + "total_loss": 0.0111083984375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001891123114023378, + "lm_loss": 0.0130615234375, + "loss": 0.0097, + "step": 3667, + "total_loss": 0.0130615234375 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018910648264555143, + "lm_loss": 0.0035247802734375, + "loss": 0.0072, + "step": 3668, + "total_loss": 0.0035247802734375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001891006524188368, + "lm_loss": 0.007415771484375, + "loss": 0.0073, + "step": 3669, + "total_loss": 0.007415771484375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001890948207222901, + "lm_loss": 0.00799560546875, + "loss": 0.0067, + "step": 3670, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001890889875560075, + "lm_loss": 0.004058837890625, + "loss": 0.007, + "step": 3671, + "total_loss": 0.004058837890625 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018908315292008525, + "lm_loss": 0.01068115234375, + "loss": 0.0075, + "step": 3672, + "total_loss": 0.01068115234375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001890773168146196, + "lm_loss": 0.005950927734375, + "loss": 0.0073, + "step": 3673, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001890714792397068, + "lm_loss": 0.006683349609375, + "loss": 0.0056, + "step": 3674, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001890656401954432, + "lm_loss": 0.01611328125, + "loss": 0.0089, + "step": 3675, + "total_loss": 0.01611328125 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018905979968192506, + "lm_loss": 0.00823974609375, + "loss": 0.0067, + "step": 3676, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018905395769924877, + "lm_loss": 0.006744384765625, + "loss": 0.0061, + "step": 3677, + "total_loss": 0.006744384765625 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018904811424751063, + "lm_loss": 0.00994873046875, + "loss": 0.0076, + "step": 3678, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018904226932680716, + "lm_loss": 0.00531005859375, + "loss": 0.0074, + "step": 3679, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018903642293723467, + "lm_loss": 0.01129150390625, + "loss": 0.0072, + "step": 3680, + "total_loss": 0.01129150390625 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018903057507888967, + "lm_loss": 0.004302978515625, + "loss": 0.0076, + "step": 3681, + "total_loss": 0.004302978515625 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018902472575186858, + "lm_loss": 0.004669189453125, + "loss": 0.0067, + "step": 3682, + "total_loss": 0.004669189453125 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018901887495626795, + "lm_loss": 0.0196533203125, + "loss": 0.0079, + "step": 3683, + "total_loss": 0.0196533203125 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001890130226921842, + "lm_loss": 0.0037841796875, + "loss": 0.0071, + "step": 3684, + "total_loss": 0.0037841796875 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018900716895971396, + "lm_loss": 0.005035400390625, + "loss": 0.0074, + "step": 3685, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001890013137589538, + "lm_loss": 0.006103515625, + "loss": 0.0098, + "step": 3686, + "total_loss": 0.006103515625 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018899545709000025, + "lm_loss": 0.004302978515625, + "loss": 0.0062, + "step": 3687, + "total_loss": 0.004302978515625 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018898959895294993, + "lm_loss": 0.0087890625, + "loss": 0.0073, + "step": 3688, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018898373934789952, + "lm_loss": 0.004058837890625, + "loss": 0.0066, + "step": 3689, + "total_loss": 0.004058837890625 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018897787827494567, + "lm_loss": 0.006256103515625, + "loss": 0.0071, + "step": 3690, + "total_loss": 0.006256103515625 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018897201573418504, + "lm_loss": 0.01043701171875, + "loss": 0.0069, + "step": 3691, + "total_loss": 0.01043701171875 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018896615172571437, + "lm_loss": 0.0032501220703125, + "loss": 0.0091, + "step": 3692, + "total_loss": 0.0032501220703125 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018896028624963037, + "lm_loss": 0.0030059814453125, + "loss": 0.0073, + "step": 3693, + "total_loss": 0.0030059814453125 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001889544193060298, + "lm_loss": 0.00836181640625, + "loss": 0.0079, + "step": 3694, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018894855089500947, + "lm_loss": 0.0086669921875, + "loss": 0.0089, + "step": 3695, + "total_loss": 0.0086669921875 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018894268101666615, + "lm_loss": 0.0115966796875, + "loss": 0.0091, + "step": 3696, + "total_loss": 0.0115966796875 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018893680967109667, + "lm_loss": 0.00537109375, + "loss": 0.009, + "step": 3697, + "total_loss": 0.00537109375 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018893093685839793, + "lm_loss": 0.0034942626953125, + "loss": 0.0077, + "step": 3698, + "total_loss": 0.0034942626953125 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001889250625786668, + "lm_loss": 0.005462646484375, + "loss": 0.0089, + "step": 3699, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018891918683200016, + "lm_loss": 0.0068359375, + "loss": 0.0103, + "step": 3700, + "total_loss": 0.0068359375 + }, + { + "epoch": 1.51, + "eval_lm_loss": 0.00946781411767006, + "eval_loss": 0.009776020422577858, + "eval_runtime": 44.1261, + "eval_samples_per_second": 22.662, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.00946781411767006, + "lm_loss": 0.0016632080078125, + "step": 3700, + "total_loss": 0.0016632080078125 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018891330961849496, + "lm_loss": 0.0064697265625, + "loss": 0.0094, + "step": 3701, + "total_loss": 0.0064697265625 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001889074309382481, + "lm_loss": 0.006591796875, + "loss": 0.0069, + "step": 3702, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018890155079135664, + "lm_loss": 0.01123046875, + "loss": 0.0094, + "step": 3703, + "total_loss": 0.01123046875 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018889566917791752, + "lm_loss": 0.006195068359375, + "loss": 0.0084, + "step": 3704, + "total_loss": 0.006195068359375 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018888978609802776, + "lm_loss": 0.013916015625, + "loss": 0.0084, + "step": 3705, + "total_loss": 0.013916015625 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018888390155178447, + "lm_loss": 0.005523681640625, + "loss": 0.0065, + "step": 3706, + "total_loss": 0.005523681640625 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018887801553928464, + "lm_loss": 0.004608154296875, + "loss": 0.0066, + "step": 3707, + "total_loss": 0.004608154296875 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018887212806062548, + "lm_loss": 0.006988525390625, + "loss": 0.0078, + "step": 3708, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.52, + "learning_rate": 0.000188866239115904, + "lm_loss": 0.00653076171875, + "loss": 0.0081, + "step": 3709, + "total_loss": 0.00653076171875 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018886034870521735, + "lm_loss": 0.00156402587890625, + "loss": 0.0096, + "step": 3710, + "total_loss": 0.00156402587890625 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001888544568286628, + "lm_loss": 0.00433349609375, + "loss": 0.0065, + "step": 3711, + "total_loss": 0.00433349609375 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018884856348633746, + "lm_loss": 0.00762939453125, + "loss": 0.0088, + "step": 3712, + "total_loss": 0.00762939453125 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018884266867833858, + "lm_loss": 0.006561279296875, + "loss": 0.0072, + "step": 3713, + "total_loss": 0.006561279296875 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001888367724047634, + "lm_loss": 0.01220703125, + "loss": 0.0082, + "step": 3714, + "total_loss": 0.01220703125 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018883087466570916, + "lm_loss": 0.00408935546875, + "loss": 0.0057, + "step": 3715, + "total_loss": 0.00408935546875 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018882497546127318, + "lm_loss": 0.00799560546875, + "loss": 0.0081, + "step": 3716, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018881907479155276, + "lm_loss": 0.00701904296875, + "loss": 0.0069, + "step": 3717, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001888131726566453, + "lm_loss": 0.005035400390625, + "loss": 0.0051, + "step": 3718, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018880726905664804, + "lm_loss": 0.01348876953125, + "loss": 0.0077, + "step": 3719, + "total_loss": 0.01348876953125 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018880136399165845, + "lm_loss": 0.009033203125, + "loss": 0.0061, + "step": 3720, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018879545746177396, + "lm_loss": 0.0098876953125, + "loss": 0.0075, + "step": 3721, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018878954946709197, + "lm_loss": 0.0126953125, + "loss": 0.0081, + "step": 3722, + "total_loss": 0.0126953125 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018878364000770993, + "lm_loss": 0.006591796875, + "loss": 0.0071, + "step": 3723, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018877772908372536, + "lm_loss": 0.0050048828125, + "loss": 0.0064, + "step": 3724, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018877181669523573, + "lm_loss": 0.00811767578125, + "loss": 0.0075, + "step": 3725, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001887659028423386, + "lm_loss": 0.0079345703125, + "loss": 0.008, + "step": 3726, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001887599875251315, + "lm_loss": 0.006195068359375, + "loss": 0.006, + "step": 3727, + "total_loss": 0.006195068359375 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018875407074371207, + "lm_loss": 0.01507568359375, + "loss": 0.0068, + "step": 3728, + "total_loss": 0.01507568359375 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018874815249817783, + "lm_loss": 0.00830078125, + "loss": 0.0079, + "step": 3729, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018874223278862647, + "lm_loss": 0.013916015625, + "loss": 0.0077, + "step": 3730, + "total_loss": 0.013916015625 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018873631161515564, + "lm_loss": 0.004974365234375, + "loss": 0.0077, + "step": 3731, + "total_loss": 0.004974365234375 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018873038897786299, + "lm_loss": 0.006500244140625, + "loss": 0.0065, + "step": 3732, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018872446487684625, + "lm_loss": 0.006500244140625, + "loss": 0.0064, + "step": 3733, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018871853931220312, + "lm_loss": 0.01007080078125, + "loss": 0.0063, + "step": 3734, + "total_loss": 0.01007080078125 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018871261228403134, + "lm_loss": 0.005584716796875, + "loss": 0.0076, + "step": 3735, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018870668379242873, + "lm_loss": 0.00732421875, + "loss": 0.0057, + "step": 3736, + "total_loss": 0.00732421875 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018870075383749306, + "lm_loss": 0.00604248046875, + "loss": 0.0076, + "step": 3737, + "total_loss": 0.00604248046875 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018869482241932217, + "lm_loss": 0.01031494140625, + "loss": 0.0095, + "step": 3738, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018868888953801387, + "lm_loss": 0.00830078125, + "loss": 0.0077, + "step": 3739, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018868295519366605, + "lm_loss": 0.0029144287109375, + "loss": 0.0084, + "step": 3740, + "total_loss": 0.0029144287109375 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018867701938637664, + "lm_loss": 0.00372314453125, + "loss": 0.0054, + "step": 3741, + "total_loss": 0.00372314453125 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001886710821162435, + "lm_loss": 0.01031494140625, + "loss": 0.0064, + "step": 3742, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001886651433833646, + "lm_loss": 0.0023193359375, + "loss": 0.0071, + "step": 3743, + "total_loss": 0.0023193359375 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018865920318783793, + "lm_loss": 0.009521484375, + "loss": 0.0067, + "step": 3744, + "total_loss": 0.009521484375 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018865326152976141, + "lm_loss": 0.0093994140625, + "loss": 0.0055, + "step": 3745, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018864731840923315, + "lm_loss": 0.007537841796875, + "loss": 0.0079, + "step": 3746, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018864137382635114, + "lm_loss": 0.0166015625, + "loss": 0.0066, + "step": 3747, + "total_loss": 0.0166015625 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018863542778121343, + "lm_loss": 0.004638671875, + "loss": 0.0077, + "step": 3748, + "total_loss": 0.004638671875 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018862948027391813, + "lm_loss": 0.007781982421875, + "loss": 0.0064, + "step": 3749, + "total_loss": 0.007781982421875 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018862353130456337, + "lm_loss": 0.01220703125, + "loss": 0.0078, + "step": 3750, + "total_loss": 0.01220703125 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018861758087324723, + "lm_loss": 0.008544921875, + "loss": 0.0072, + "step": 3751, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001886116289800679, + "lm_loss": 0.004547119140625, + "loss": 0.0051, + "step": 3752, + "total_loss": 0.004547119140625 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018860567562512358, + "lm_loss": 0.00946044921875, + "loss": 0.0065, + "step": 3753, + "total_loss": 0.00946044921875 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001885997208085125, + "lm_loss": 0.004791259765625, + "loss": 0.0062, + "step": 3754, + "total_loss": 0.004791259765625 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018859376453033282, + "lm_loss": 0.0084228515625, + "loss": 0.008, + "step": 3755, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018858780679068283, + "lm_loss": 0.00689697265625, + "loss": 0.0079, + "step": 3756, + "total_loss": 0.00689697265625 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018858184758966084, + "lm_loss": 0.0035858154296875, + "loss": 0.0064, + "step": 3757, + "total_loss": 0.0035858154296875 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018857588692736512, + "lm_loss": 0.01446533203125, + "loss": 0.0069, + "step": 3758, + "total_loss": 0.01446533203125 + }, + { + "epoch": 1.54, + "learning_rate": 0.000188569924803894, + "lm_loss": 0.005767822265625, + "loss": 0.007, + "step": 3759, + "total_loss": 0.005767822265625 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018856396121934586, + "lm_loss": 0.0064697265625, + "loss": 0.0077, + "step": 3760, + "total_loss": 0.0064697265625 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018855799617381903, + "lm_loss": 0.0076904296875, + "loss": 0.0078, + "step": 3761, + "total_loss": 0.0076904296875 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018855202966741198, + "lm_loss": 0.0101318359375, + "loss": 0.0074, + "step": 3762, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018854606170022308, + "lm_loss": 0.0025787353515625, + "loss": 0.0063, + "step": 3763, + "total_loss": 0.0025787353515625 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018854009227235082, + "lm_loss": 0.004364013671875, + "loss": 0.0069, + "step": 3764, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001885341213838936, + "lm_loss": 0.0013885498046875, + "loss": 0.0093, + "step": 3765, + "total_loss": 0.0013885498046875 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018852814903494997, + "lm_loss": 0.0027313232421875, + "loss": 0.0061, + "step": 3766, + "total_loss": 0.0027313232421875 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001885221752256185, + "lm_loss": 0.0023193359375, + "loss": 0.0061, + "step": 3767, + "total_loss": 0.0023193359375 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018851619995599762, + "lm_loss": 0.016357421875, + "loss": 0.0075, + "step": 3768, + "total_loss": 0.016357421875 + }, + { + "epoch": 1.54, + "learning_rate": 0.000188510223226186, + "lm_loss": 0.00836181640625, + "loss": 0.0067, + "step": 3769, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001885042450362822, + "lm_loss": 0.00836181640625, + "loss": 0.0076, + "step": 3770, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018849826538638485, + "lm_loss": 0.005157470703125, + "loss": 0.0072, + "step": 3771, + "total_loss": 0.005157470703125 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018849228427659257, + "lm_loss": 0.0045166015625, + "loss": 0.0065, + "step": 3772, + "total_loss": 0.0045166015625 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018848630170700402, + "lm_loss": 0.0028228759765625, + "loss": 0.0058, + "step": 3773, + "total_loss": 0.0028228759765625 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001884803176777179, + "lm_loss": 0.006134033203125, + "loss": 0.0072, + "step": 3774, + "total_loss": 0.006134033203125 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018847433218883294, + "lm_loss": 0.0069580078125, + "loss": 0.0078, + "step": 3775, + "total_loss": 0.0069580078125 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018846834524044787, + "lm_loss": 0.006622314453125, + "loss": 0.0071, + "step": 3776, + "total_loss": 0.006622314453125 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018846235683266148, + "lm_loss": 0.004302978515625, + "loss": 0.0054, + "step": 3777, + "total_loss": 0.004302978515625 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018845636696557247, + "lm_loss": 0.004913330078125, + "loss": 0.0077, + "step": 3778, + "total_loss": 0.004913330078125 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018845037563927975, + "lm_loss": 0.009033203125, + "loss": 0.0104, + "step": 3779, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018844438285388207, + "lm_loss": 0.006866455078125, + "loss": 0.0081, + "step": 3780, + "total_loss": 0.006866455078125 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018843838860947836, + "lm_loss": 0.00958251953125, + "loss": 0.0075, + "step": 3781, + "total_loss": 0.00958251953125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001884323929061675, + "lm_loss": 0.00408935546875, + "loss": 0.0058, + "step": 3782, + "total_loss": 0.00408935546875 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018842639574404832, + "lm_loss": 0.005584716796875, + "loss": 0.0072, + "step": 3783, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018842039712321985, + "lm_loss": 0.00445556640625, + "loss": 0.0088, + "step": 3784, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018841439704378097, + "lm_loss": 0.00701904296875, + "loss": 0.0074, + "step": 3785, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018840839550583068, + "lm_loss": 0.008544921875, + "loss": 0.0073, + "step": 3786, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018840239250946802, + "lm_loss": 0.0106201171875, + "loss": 0.0067, + "step": 3787, + "total_loss": 0.0106201171875 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018839638805479196, + "lm_loss": 0.00726318359375, + "loss": 0.009, + "step": 3788, + "total_loss": 0.00726318359375 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001883903821419016, + "lm_loss": 0.005279541015625, + "loss": 0.0074, + "step": 3789, + "total_loss": 0.005279541015625 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018838437477089598, + "lm_loss": 0.007354736328125, + "loss": 0.0069, + "step": 3790, + "total_loss": 0.007354736328125 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018837836594187422, + "lm_loss": 0.00921630859375, + "loss": 0.0093, + "step": 3791, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018837235565493543, + "lm_loss": 0.0118408203125, + "loss": 0.0066, + "step": 3792, + "total_loss": 0.0118408203125 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018836634391017877, + "lm_loss": 0.007598876953125, + "loss": 0.0076, + "step": 3793, + "total_loss": 0.007598876953125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001883603307077034, + "lm_loss": 0.00830078125, + "loss": 0.0074, + "step": 3794, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018835431604760853, + "lm_loss": 0.009765625, + "loss": 0.0087, + "step": 3795, + "total_loss": 0.009765625 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018834829992999338, + "lm_loss": 0.0028533935546875, + "loss": 0.0056, + "step": 3796, + "total_loss": 0.0028533935546875 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018834228235495716, + "lm_loss": 0.010009765625, + "loss": 0.008, + "step": 3797, + "total_loss": 0.010009765625 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018833626332259918, + "lm_loss": 0.00994873046875, + "loss": 0.0061, + "step": 3798, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001883302428330187, + "lm_loss": 0.006378173828125, + "loss": 0.0069, + "step": 3799, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018832422088631507, + "lm_loss": 0.002593994140625, + "loss": 0.0094, + "step": 3800, + "total_loss": 0.002593994140625 + }, + { + "epoch": 1.55, + "eval_lm_loss": 0.00954235065728426, + "eval_loss": 0.009984311647713184, + "eval_runtime": 43.8645, + "eval_samples_per_second": 22.797, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.00954235065728426, + "lm_loss": 0.0020599365234375, + "step": 3800, + "total_loss": 0.0020599365234375 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001883181974825876, + "lm_loss": 0.006011962890625, + "loss": 0.0074, + "step": 3801, + "total_loss": 0.006011962890625 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018831217262193568, + "lm_loss": 0.005859375, + "loss": 0.01, + "step": 3802, + "total_loss": 0.005859375 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018830614630445866, + "lm_loss": 0.008056640625, + "loss": 0.0079, + "step": 3803, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.56, + "learning_rate": 0.000188300118530256, + "lm_loss": 0.005950927734375, + "loss": 0.0075, + "step": 3804, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001882940892994271, + "lm_loss": 0.004730224609375, + "loss": 0.0084, + "step": 3805, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001882880586120714, + "lm_loss": 0.00433349609375, + "loss": 0.0072, + "step": 3806, + "total_loss": 0.00433349609375 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018828202646828848, + "lm_loss": 0.0091552734375, + "loss": 0.0072, + "step": 3807, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018827599286817774, + "lm_loss": 0.004119873046875, + "loss": 0.0066, + "step": 3808, + "total_loss": 0.004119873046875 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018826995781183876, + "lm_loss": 0.007080078125, + "loss": 0.0069, + "step": 3809, + "total_loss": 0.007080078125 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018826392129937109, + "lm_loss": 0.0113525390625, + "loss": 0.0077, + "step": 3810, + "total_loss": 0.0113525390625 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018825788333087432, + "lm_loss": 0.006866455078125, + "loss": 0.0057, + "step": 3811, + "total_loss": 0.006866455078125 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018825184390644804, + "lm_loss": 0.006927490234375, + "loss": 0.008, + "step": 3812, + "total_loss": 0.006927490234375 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018824580302619185, + "lm_loss": 0.0059814453125, + "loss": 0.0082, + "step": 3813, + "total_loss": 0.0059814453125 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001882397606902055, + "lm_loss": 0.0037994384765625, + "loss": 0.0091, + "step": 3814, + "total_loss": 0.0037994384765625 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018823371689858856, + "lm_loss": 0.005828857421875, + "loss": 0.0072, + "step": 3815, + "total_loss": 0.005828857421875 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001882276716514408, + "lm_loss": 0.0021514892578125, + "loss": 0.0071, + "step": 3816, + "total_loss": 0.0021514892578125 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001882216249488619, + "lm_loss": 0.01123046875, + "loss": 0.0074, + "step": 3817, + "total_loss": 0.01123046875 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001882155767909516, + "lm_loss": 0.0203857421875, + "loss": 0.0082, + "step": 3818, + "total_loss": 0.0203857421875 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018820952717780976, + "lm_loss": 0.00933837890625, + "loss": 0.0097, + "step": 3819, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018820347610953606, + "lm_loss": 0.0034332275390625, + "loss": 0.007, + "step": 3820, + "total_loss": 0.0034332275390625 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001881974235862304, + "lm_loss": 0.00537109375, + "loss": 0.0082, + "step": 3821, + "total_loss": 0.00537109375 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018819136960799258, + "lm_loss": 0.0087890625, + "loss": 0.0073, + "step": 3822, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018818531417492254, + "lm_loss": 0.00384521484375, + "loss": 0.0054, + "step": 3823, + "total_loss": 0.00384521484375 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018817925728712006, + "lm_loss": 0.0032958984375, + "loss": 0.0097, + "step": 3824, + "total_loss": 0.0032958984375 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018817319894468514, + "lm_loss": 0.01025390625, + "loss": 0.0084, + "step": 3825, + "total_loss": 0.01025390625 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001881671391477177, + "lm_loss": 0.0064697265625, + "loss": 0.0071, + "step": 3826, + "total_loss": 0.0064697265625 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018816107789631768, + "lm_loss": 0.01806640625, + "loss": 0.0087, + "step": 3827, + "total_loss": 0.01806640625 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018815501519058508, + "lm_loss": 0.0076904296875, + "loss": 0.0079, + "step": 3828, + "total_loss": 0.0076904296875 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018814895103061994, + "lm_loss": 0.0078125, + "loss": 0.0076, + "step": 3829, + "total_loss": 0.0078125 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018814288541652227, + "lm_loss": 0.006927490234375, + "loss": 0.0075, + "step": 3830, + "total_loss": 0.006927490234375 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018813681834839216, + "lm_loss": 0.0128173828125, + "loss": 0.0074, + "step": 3831, + "total_loss": 0.0128173828125 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018813074982632966, + "lm_loss": 0.008056640625, + "loss": 0.006, + "step": 3832, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001881246798504349, + "lm_loss": 0.005096435546875, + "loss": 0.008, + "step": 3833, + "total_loss": 0.005096435546875 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018811860842080796, + "lm_loss": 0.004180908203125, + "loss": 0.006, + "step": 3834, + "total_loss": 0.004180908203125 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018811253553754908, + "lm_loss": 0.005767822265625, + "loss": 0.0089, + "step": 3835, + "total_loss": 0.005767822265625 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018810646120075836, + "lm_loss": 0.006805419921875, + "loss": 0.0067, + "step": 3836, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018810038541053608, + "lm_loss": 0.0032501220703125, + "loss": 0.0078, + "step": 3837, + "total_loss": 0.0032501220703125 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018809430816698242, + "lm_loss": 0.0106201171875, + "loss": 0.0067, + "step": 3838, + "total_loss": 0.0106201171875 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018808822947019764, + "lm_loss": 0.006744384765625, + "loss": 0.0093, + "step": 3839, + "total_loss": 0.006744384765625 + }, + { + "epoch": 1.57, + "learning_rate": 0.000188082149320282, + "lm_loss": 0.00177001953125, + "loss": 0.007, + "step": 3840, + "total_loss": 0.00177001953125 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018807606771733583, + "lm_loss": 0.00408935546875, + "loss": 0.0092, + "step": 3841, + "total_loss": 0.00408935546875 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018806998466145943, + "lm_loss": 0.004150390625, + "loss": 0.0053, + "step": 3842, + "total_loss": 0.004150390625 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001880639001527532, + "lm_loss": 0.006988525390625, + "loss": 0.0105, + "step": 3843, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018805781419131744, + "lm_loss": 0.006805419921875, + "loss": 0.0071, + "step": 3844, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018805172677725262, + "lm_loss": 0.005645751953125, + "loss": 0.0081, + "step": 3845, + "total_loss": 0.005645751953125 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018804563791065908, + "lm_loss": 0.0059814453125, + "loss": 0.009, + "step": 3846, + "total_loss": 0.0059814453125 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001880395475916373, + "lm_loss": 0.00244140625, + "loss": 0.0082, + "step": 3847, + "total_loss": 0.00244140625 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018803345582028779, + "lm_loss": 0.00131988525390625, + "loss": 0.0062, + "step": 3848, + "total_loss": 0.00131988525390625 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018802736259671096, + "lm_loss": 0.00531005859375, + "loss": 0.0077, + "step": 3849, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001880212679210074, + "lm_loss": 0.008056640625, + "loss": 0.0069, + "step": 3850, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018801517179327756, + "lm_loss": 0.0031585693359375, + "loss": 0.0069, + "step": 3851, + "total_loss": 0.0031585693359375 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001880090742136221, + "lm_loss": 0.01104736328125, + "loss": 0.0063, + "step": 3852, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018800297518214157, + "lm_loss": 0.009033203125, + "loss": 0.0083, + "step": 3853, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018799687469893658, + "lm_loss": 0.00457763671875, + "loss": 0.0068, + "step": 3854, + "total_loss": 0.00457763671875 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018799077276410777, + "lm_loss": 0.00726318359375, + "loss": 0.0058, + "step": 3855, + "total_loss": 0.00726318359375 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001879846693777558, + "lm_loss": 0.006927490234375, + "loss": 0.0087, + "step": 3856, + "total_loss": 0.006927490234375 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018797856453998133, + "lm_loss": 0.006011962890625, + "loss": 0.0077, + "step": 3857, + "total_loss": 0.006011962890625 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018797245825088508, + "lm_loss": 0.0031280517578125, + "loss": 0.0075, + "step": 3858, + "total_loss": 0.0031280517578125 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018796635051056778, + "lm_loss": 0.0037841796875, + "loss": 0.0075, + "step": 3859, + "total_loss": 0.0037841796875 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018796024131913024, + "lm_loss": 0.00726318359375, + "loss": 0.0067, + "step": 3860, + "total_loss": 0.00726318359375 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018795413067667315, + "lm_loss": 0.0126953125, + "loss": 0.0081, + "step": 3861, + "total_loss": 0.0126953125 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018794801858329735, + "lm_loss": 0.003082275390625, + "loss": 0.0078, + "step": 3862, + "total_loss": 0.003082275390625 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001879419050391037, + "lm_loss": 0.01251220703125, + "loss": 0.0085, + "step": 3863, + "total_loss": 0.01251220703125 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018793579004419299, + "lm_loss": 0.0030975341796875, + "loss": 0.0052, + "step": 3864, + "total_loss": 0.0030975341796875 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018792967359866617, + "lm_loss": 0.0074462890625, + "loss": 0.0074, + "step": 3865, + "total_loss": 0.0074462890625 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018792355570262405, + "lm_loss": 0.0068359375, + "loss": 0.0072, + "step": 3866, + "total_loss": 0.0068359375 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018791743635616763, + "lm_loss": 0.0037994384765625, + "loss": 0.008, + "step": 3867, + "total_loss": 0.0037994384765625 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001879113155593978, + "lm_loss": 0.006988525390625, + "loss": 0.0081, + "step": 3868, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.58, + "learning_rate": 0.0001879051933124156, + "lm_loss": 0.012939453125, + "loss": 0.009, + "step": 3869, + "total_loss": 0.012939453125 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018789906961532191, + "lm_loss": 0.006500244140625, + "loss": 0.0063, + "step": 3870, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018789294446821785, + "lm_loss": 0.0162353515625, + "loss": 0.0066, + "step": 3871, + "total_loss": 0.0162353515625 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018788681787120445, + "lm_loss": 0.004364013671875, + "loss": 0.0077, + "step": 3872, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018788068982438275, + "lm_loss": 0.0130615234375, + "loss": 0.0078, + "step": 3873, + "total_loss": 0.0130615234375 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018787456032785384, + "lm_loss": 0.0038909912109375, + "loss": 0.0078, + "step": 3874, + "total_loss": 0.0038909912109375 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018786842938171884, + "lm_loss": 0.005615234375, + "loss": 0.0072, + "step": 3875, + "total_loss": 0.005615234375 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018786229698607892, + "lm_loss": 0.0052490234375, + "loss": 0.0055, + "step": 3876, + "total_loss": 0.0052490234375 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018785616314103516, + "lm_loss": 0.0098876953125, + "loss": 0.0077, + "step": 3877, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018785002784668883, + "lm_loss": 0.00677490234375, + "loss": 0.0078, + "step": 3878, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001878438911031411, + "lm_loss": 0.0093994140625, + "loss": 0.0077, + "step": 3879, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001878377529104932, + "lm_loss": 0.003448486328125, + "loss": 0.0075, + "step": 3880, + "total_loss": 0.003448486328125 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018783161326884642, + "lm_loss": 0.006011962890625, + "loss": 0.0078, + "step": 3881, + "total_loss": 0.006011962890625 + }, + { + "epoch": 1.59, + "learning_rate": 0.000187825472178302, + "lm_loss": 0.00982666015625, + "loss": 0.0074, + "step": 3882, + "total_loss": 0.00982666015625 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018781932963896127, + "lm_loss": 0.004547119140625, + "loss": 0.0068, + "step": 3883, + "total_loss": 0.004547119140625 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018781318565092557, + "lm_loss": 0.0057373046875, + "loss": 0.0085, + "step": 3884, + "total_loss": 0.0057373046875 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001878070402142962, + "lm_loss": 0.004547119140625, + "loss": 0.0077, + "step": 3885, + "total_loss": 0.004547119140625 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018780089332917462, + "lm_loss": 0.004669189453125, + "loss": 0.0063, + "step": 3886, + "total_loss": 0.004669189453125 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018779474499566214, + "lm_loss": 0.00823974609375, + "loss": 0.008, + "step": 3887, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018778859521386025, + "lm_loss": 0.00506591796875, + "loss": 0.0063, + "step": 3888, + "total_loss": 0.00506591796875 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018778244398387037, + "lm_loss": 0.0038299560546875, + "loss": 0.0058, + "step": 3889, + "total_loss": 0.0038299560546875 + }, + { + "epoch": 1.59, + "learning_rate": 0.000187776291305794, + "lm_loss": 0.006591796875, + "loss": 0.0112, + "step": 3890, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001877701371797326, + "lm_loss": 0.00555419921875, + "loss": 0.0079, + "step": 3891, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018776398160578773, + "lm_loss": 0.005035400390625, + "loss": 0.0074, + "step": 3892, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001877578245840609, + "lm_loss": 0.0031280517578125, + "loss": 0.0064, + "step": 3893, + "total_loss": 0.0031280517578125 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001877516661146537, + "lm_loss": 0.0142822265625, + "loss": 0.0073, + "step": 3894, + "total_loss": 0.0142822265625 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001877455061976677, + "lm_loss": 0.01544189453125, + "loss": 0.007, + "step": 3895, + "total_loss": 0.01544189453125 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018773934483320452, + "lm_loss": 0.00096893310546875, + "loss": 0.0061, + "step": 3896, + "total_loss": 0.00096893310546875 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018773318202136582, + "lm_loss": 0.004119873046875, + "loss": 0.0092, + "step": 3897, + "total_loss": 0.004119873046875 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018772701776225326, + "lm_loss": 0.007354736328125, + "loss": 0.0085, + "step": 3898, + "total_loss": 0.007354736328125 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001877208520559685, + "lm_loss": 0.0091552734375, + "loss": 0.0068, + "step": 3899, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001877146849026133, + "lm_loss": 0.00531005859375, + "loss": 0.0067, + "step": 3900, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.59, + "eval_lm_loss": 0.009079032577574253, + "eval_loss": 0.009434985928237438, + "eval_runtime": 43.9242, + "eval_samples_per_second": 22.766, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009079032577574253, + "lm_loss": 0.00119781494140625, + "step": 3900, + "total_loss": 0.00119781494140625 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001877085163022893, + "lm_loss": 0.0087890625, + "loss": 0.0084, + "step": 3901, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001877023462550984, + "lm_loss": 0.006866455078125, + "loss": 0.0059, + "step": 3902, + "total_loss": 0.006866455078125 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018769617476114227, + "lm_loss": 0.0087890625, + "loss": 0.0089, + "step": 3903, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018769000182052278, + "lm_loss": 0.003509521484375, + "loss": 0.0068, + "step": 3904, + "total_loss": 0.003509521484375 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018768382743334172, + "lm_loss": 0.00848388671875, + "loss": 0.0072, + "step": 3905, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018767765159970095, + "lm_loss": 0.01318359375, + "loss": 0.0075, + "step": 3906, + "total_loss": 0.01318359375 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018767147431970237, + "lm_loss": 0.00909423828125, + "loss": 0.0073, + "step": 3907, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018766529559344788, + "lm_loss": 0.0059814453125, + "loss": 0.0072, + "step": 3908, + "total_loss": 0.0059814453125 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018765911542103938, + "lm_loss": 0.004486083984375, + "loss": 0.0059, + "step": 3909, + "total_loss": 0.004486083984375 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018765293380257884, + "lm_loss": 0.01116943359375, + "loss": 0.0073, + "step": 3910, + "total_loss": 0.01116943359375 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018764675073816824, + "lm_loss": 0.0147705078125, + "loss": 0.007, + "step": 3911, + "total_loss": 0.0147705078125 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018764056622790957, + "lm_loss": 0.003875732421875, + "loss": 0.0081, + "step": 3912, + "total_loss": 0.003875732421875 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018763438027190484, + "lm_loss": 0.00811767578125, + "loss": 0.0082, + "step": 3913, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018762819287025613, + "lm_loss": 0.007080078125, + "loss": 0.0088, + "step": 3914, + "total_loss": 0.007080078125 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018762200402306546, + "lm_loss": 0.0020751953125, + "loss": 0.0063, + "step": 3915, + "total_loss": 0.0020751953125 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018761581373043495, + "lm_loss": 0.0032958984375, + "loss": 0.0079, + "step": 3916, + "total_loss": 0.0032958984375 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018760962199246674, + "lm_loss": 0.0091552734375, + "loss": 0.0082, + "step": 3917, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018760342880926295, + "lm_loss": 0.0030517578125, + "loss": 0.0072, + "step": 3918, + "total_loss": 0.0030517578125 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001875972341809257, + "lm_loss": 0.0052490234375, + "loss": 0.0094, + "step": 3919, + "total_loss": 0.0052490234375 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018759103810755726, + "lm_loss": 0.00970458984375, + "loss": 0.0075, + "step": 3920, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001875848405892598, + "lm_loss": 0.0093994140625, + "loss": 0.0074, + "step": 3921, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001875786416261355, + "lm_loss": 0.0048828125, + "loss": 0.0051, + "step": 3922, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018757244121828673, + "lm_loss": 0.01300048828125, + "loss": 0.0083, + "step": 3923, + "total_loss": 0.01300048828125 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018756623936581573, + "lm_loss": 0.00537109375, + "loss": 0.0085, + "step": 3924, + "total_loss": 0.00537109375 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018756003606882477, + "lm_loss": 0.01458740234375, + "loss": 0.0097, + "step": 3925, + "total_loss": 0.01458740234375 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018755383132741622, + "lm_loss": 0.0048828125, + "loss": 0.0061, + "step": 3926, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001875476251416924, + "lm_loss": 0.00360107421875, + "loss": 0.007, + "step": 3927, + "total_loss": 0.00360107421875 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018754141751175577, + "lm_loss": 0.006134033203125, + "loss": 0.0072, + "step": 3928, + "total_loss": 0.006134033203125 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018753520843770867, + "lm_loss": 0.0150146484375, + "loss": 0.0069, + "step": 3929, + "total_loss": 0.0150146484375 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001875289979196535, + "lm_loss": 0.01214599609375, + "loss": 0.0075, + "step": 3930, + "total_loss": 0.01214599609375 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018752278595769278, + "lm_loss": 0.0084228515625, + "loss": 0.009, + "step": 3931, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018751657255192895, + "lm_loss": 0.0093994140625, + "loss": 0.0091, + "step": 3932, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001875103577024645, + "lm_loss": 0.0079345703125, + "loss": 0.0077, + "step": 3933, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018750414140940197, + "lm_loss": 0.00347900390625, + "loss": 0.006, + "step": 3934, + "total_loss": 0.00347900390625 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018749792367284387, + "lm_loss": 0.005706787109375, + "loss": 0.0097, + "step": 3935, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001874917044928928, + "lm_loss": 0.01123046875, + "loss": 0.0076, + "step": 3936, + "total_loss": 0.01123046875 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018748548386965136, + "lm_loss": 0.003509521484375, + "loss": 0.0057, + "step": 3937, + "total_loss": 0.003509521484375 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018747926180322217, + "lm_loss": 0.0133056640625, + "loss": 0.0089, + "step": 3938, + "total_loss": 0.0133056640625 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018747303829370787, + "lm_loss": 0.007537841796875, + "loss": 0.0072, + "step": 3939, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018746681334121107, + "lm_loss": 0.004852294921875, + "loss": 0.0102, + "step": 3940, + "total_loss": 0.004852294921875 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018746058694583452, + "lm_loss": 0.0079345703125, + "loss": 0.0077, + "step": 3941, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018745435910768094, + "lm_loss": 0.00299072265625, + "loss": 0.0095, + "step": 3942, + "total_loss": 0.00299072265625 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018744812982685302, + "lm_loss": 0.004547119140625, + "loss": 0.008, + "step": 3943, + "total_loss": 0.004547119140625 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018744189910345352, + "lm_loss": 0.00665283203125, + "loss": 0.0082, + "step": 3944, + "total_loss": 0.00665283203125 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018743566693758527, + "lm_loss": 0.003997802734375, + "loss": 0.0076, + "step": 3945, + "total_loss": 0.003997802734375 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018742943332935105, + "lm_loss": 0.01007080078125, + "loss": 0.0083, + "step": 3946, + "total_loss": 0.01007080078125 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018742319827885371, + "lm_loss": 0.005340576171875, + "loss": 0.0068, + "step": 3947, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018741696178619608, + "lm_loss": 0.007476806640625, + "loss": 0.0057, + "step": 3948, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018741072385148103, + "lm_loss": 0.0079345703125, + "loss": 0.0071, + "step": 3949, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018740448447481152, + "lm_loss": 0.005279541015625, + "loss": 0.007, + "step": 3950, + "total_loss": 0.005279541015625 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001873982436562904, + "lm_loss": 0.00154876708984375, + "loss": 0.0079, + "step": 3951, + "total_loss": 0.00154876708984375 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018739200139602068, + "lm_loss": 0.005279541015625, + "loss": 0.0085, + "step": 3952, + "total_loss": 0.005279541015625 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001873857576941053, + "lm_loss": 0.00640869140625, + "loss": 0.0068, + "step": 3953, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001873795125506473, + "lm_loss": 0.0031890869140625, + "loss": 0.0061, + "step": 3954, + "total_loss": 0.0031890869140625 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018737326596574965, + "lm_loss": 0.0023651123046875, + "loss": 0.0068, + "step": 3955, + "total_loss": 0.0023651123046875 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001873670179395154, + "lm_loss": 0.00927734375, + "loss": 0.0065, + "step": 3956, + "total_loss": 0.00927734375 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018736076847204768, + "lm_loss": 0.006317138671875, + "loss": 0.0091, + "step": 3957, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018735451756344955, + "lm_loss": 0.00823974609375, + "loss": 0.0093, + "step": 3958, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018734826521382407, + "lm_loss": 0.0091552734375, + "loss": 0.0084, + "step": 3959, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018734201142327445, + "lm_loss": 0.0067138671875, + "loss": 0.0066, + "step": 3960, + "total_loss": 0.0067138671875 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018733575619190383, + "lm_loss": 0.005218505859375, + "loss": 0.0083, + "step": 3961, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018732949951981543, + "lm_loss": 0.0133056640625, + "loss": 0.0084, + "step": 3962, + "total_loss": 0.0133056640625 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018732324140711238, + "lm_loss": 0.004730224609375, + "loss": 0.0062, + "step": 3963, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.62, + "learning_rate": 0.000187316981853898, + "lm_loss": 0.004638671875, + "loss": 0.0067, + "step": 3964, + "total_loss": 0.004638671875 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018731072086027555, + "lm_loss": 0.00787353515625, + "loss": 0.0071, + "step": 3965, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018730445842634824, + "lm_loss": 0.00433349609375, + "loss": 0.008, + "step": 3966, + "total_loss": 0.00433349609375 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018729819455221944, + "lm_loss": 0.002685546875, + "loss": 0.0063, + "step": 3967, + "total_loss": 0.002685546875 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018729192923799245, + "lm_loss": 0.0146484375, + "loss": 0.0074, + "step": 3968, + "total_loss": 0.0146484375 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018728566248377065, + "lm_loss": 0.00787353515625, + "loss": 0.0078, + "step": 3969, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001872793942896574, + "lm_loss": 0.00439453125, + "loss": 0.0058, + "step": 3970, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018727312465575608, + "lm_loss": 0.0101318359375, + "loss": 0.0067, + "step": 3971, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018726685358217018, + "lm_loss": 0.006134033203125, + "loss": 0.0069, + "step": 3972, + "total_loss": 0.006134033203125 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018726058106900307, + "lm_loss": 0.006439208984375, + "loss": 0.0072, + "step": 3973, + "total_loss": 0.006439208984375 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018725430711635828, + "lm_loss": 0.008544921875, + "loss": 0.0075, + "step": 3974, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001872480317243393, + "lm_loss": 0.00830078125, + "loss": 0.0078, + "step": 3975, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018724175489304962, + "lm_loss": 0.00921630859375, + "loss": 0.0064, + "step": 3976, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018723547662259285, + "lm_loss": 0.01007080078125, + "loss": 0.0069, + "step": 3977, + "total_loss": 0.01007080078125 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018722919691307248, + "lm_loss": 0.006744384765625, + "loss": 0.0068, + "step": 3978, + "total_loss": 0.006744384765625 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018722291576459216, + "lm_loss": 0.00555419921875, + "loss": 0.0081, + "step": 3979, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018721663317725545, + "lm_loss": 0.00970458984375, + "loss": 0.0082, + "step": 3980, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018721034915116603, + "lm_loss": 0.0057373046875, + "loss": 0.0077, + "step": 3981, + "total_loss": 0.0057373046875 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018720406368642758, + "lm_loss": 0.00970458984375, + "loss": 0.0071, + "step": 3982, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018719777678314375, + "lm_loss": 0.00946044921875, + "loss": 0.0096, + "step": 3983, + "total_loss": 0.00946044921875 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018719148844141827, + "lm_loss": 0.0050048828125, + "loss": 0.0072, + "step": 3984, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018718519866135487, + "lm_loss": 0.00848388671875, + "loss": 0.0077, + "step": 3985, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001871789074430573, + "lm_loss": 0.0022125244140625, + "loss": 0.0067, + "step": 3986, + "total_loss": 0.0022125244140625 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018717261478662934, + "lm_loss": 0.0174560546875, + "loss": 0.0084, + "step": 3987, + "total_loss": 0.0174560546875 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001871663206921748, + "lm_loss": 0.0067138671875, + "loss": 0.0078, + "step": 3988, + "total_loss": 0.0067138671875 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018716002515979755, + "lm_loss": 0.00445556640625, + "loss": 0.0071, + "step": 3989, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018715372818960135, + "lm_loss": 0.01300048828125, + "loss": 0.0076, + "step": 3990, + "total_loss": 0.01300048828125 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018714742978169017, + "lm_loss": 0.009033203125, + "loss": 0.0083, + "step": 3991, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018714112993616783, + "lm_loss": 0.0081787109375, + "loss": 0.0069, + "step": 3992, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018713482865313834, + "lm_loss": 0.00860595703125, + "loss": 0.0084, + "step": 3993, + "total_loss": 0.00860595703125 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001871285259327056, + "lm_loss": 0.00994873046875, + "loss": 0.0072, + "step": 3994, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001871222217749736, + "lm_loss": 0.00372314453125, + "loss": 0.006, + "step": 3995, + "total_loss": 0.00372314453125 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018711591618004628, + "lm_loss": 0.0125732421875, + "loss": 0.007, + "step": 3996, + "total_loss": 0.0125732421875 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018710960914802773, + "lm_loss": 0.01104736328125, + "loss": 0.008, + "step": 3997, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018710330067902195, + "lm_loss": 0.0024261474609375, + "loss": 0.0084, + "step": 3998, + "total_loss": 0.0024261474609375 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018709699077313304, + "lm_loss": 0.0103759765625, + "loss": 0.0074, + "step": 3999, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018709067943046507, + "lm_loss": 0.00628662109375, + "loss": 0.0075, + "step": 4000, + "total_loss": 0.00628662109375 + }, + { + "epoch": 1.64, + "eval_lm_loss": 0.009657489135861397, + "eval_loss": 0.010090351104736328, + "eval_runtime": 44.1481, + "eval_samples_per_second": 22.651, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.009657489135861397, + "lm_loss": 0.00113677978515625, + "step": 4000, + "total_loss": 0.00113677978515625 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018708436665112213, + "lm_loss": 0.00335693359375, + "loss": 0.0064, + "step": 4001, + "total_loss": 0.00335693359375 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018707805243520838, + "lm_loss": 0.00994873046875, + "loss": 0.0082, + "step": 4002, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018707173678282802, + "lm_loss": 0.01141357421875, + "loss": 0.0073, + "step": 4003, + "total_loss": 0.01141357421875 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001870654196940852, + "lm_loss": 0.01080322265625, + "loss": 0.007, + "step": 4004, + "total_loss": 0.01080322265625 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018705910116908407, + "lm_loss": 0.00445556640625, + "loss": 0.0062, + "step": 4005, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018705278120792896, + "lm_loss": 0.00372314453125, + "loss": 0.0069, + "step": 4006, + "total_loss": 0.00372314453125 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001870464598107241, + "lm_loss": 0.006500244140625, + "loss": 0.0073, + "step": 4007, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001870401369775737, + "lm_loss": 0.00677490234375, + "loss": 0.007, + "step": 4008, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018703381270858218, + "lm_loss": 0.0103759765625, + "loss": 0.009, + "step": 4009, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018702748700385376, + "lm_loss": 0.006591796875, + "loss": 0.0079, + "step": 4010, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018702115986349286, + "lm_loss": 0.00836181640625, + "loss": 0.0059, + "step": 4011, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018701483128760383, + "lm_loss": 0.00811767578125, + "loss": 0.0083, + "step": 4012, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018700850127629105, + "lm_loss": 0.00482177734375, + "loss": 0.0076, + "step": 4013, + "total_loss": 0.00482177734375 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018700216982965899, + "lm_loss": 0.004547119140625, + "loss": 0.0071, + "step": 4014, + "total_loss": 0.004547119140625 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018699583694781205, + "lm_loss": 0.00872802734375, + "loss": 0.0101, + "step": 4015, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001869895026308547, + "lm_loss": 0.0034027099609375, + "loss": 0.0067, + "step": 4016, + "total_loss": 0.0034027099609375 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018698316687889148, + "lm_loss": 0.00787353515625, + "loss": 0.0073, + "step": 4017, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018697682969202685, + "lm_loss": 0.00775146484375, + "loss": 0.008, + "step": 4018, + "total_loss": 0.00775146484375 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001869704910703654, + "lm_loss": 0.00799560546875, + "loss": 0.0069, + "step": 4019, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018696415101401165, + "lm_loss": 0.00787353515625, + "loss": 0.0085, + "step": 4020, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018695780952307023, + "lm_loss": 0.00482177734375, + "loss": 0.007, + "step": 4021, + "total_loss": 0.00482177734375 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018695146659764567, + "lm_loss": 0.002410888671875, + "loss": 0.008, + "step": 4022, + "total_loss": 0.002410888671875 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018694512223784273, + "lm_loss": 0.0091552734375, + "loss": 0.0076, + "step": 4023, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018693877644376596, + "lm_loss": 0.00390625, + "loss": 0.0074, + "step": 4024, + "total_loss": 0.00390625 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018693242921552012, + "lm_loss": 0.005401611328125, + "loss": 0.0082, + "step": 4025, + "total_loss": 0.005401611328125 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018692608055320985, + "lm_loss": 0.01007080078125, + "loss": 0.009, + "step": 4026, + "total_loss": 0.01007080078125 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001869197304569399, + "lm_loss": 0.0048828125, + "loss": 0.0074, + "step": 4027, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018691337892681506, + "lm_loss": 0.01055908203125, + "loss": 0.0087, + "step": 4028, + "total_loss": 0.01055908203125 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018690702596294007, + "lm_loss": 0.003326416015625, + "loss": 0.0082, + "step": 4029, + "total_loss": 0.003326416015625 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018690067156541975, + "lm_loss": 0.00689697265625, + "loss": 0.0075, + "step": 4030, + "total_loss": 0.00689697265625 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018689431573435888, + "lm_loss": 0.0078125, + "loss": 0.0067, + "step": 4031, + "total_loss": 0.0078125 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018688795846986234, + "lm_loss": 0.0091552734375, + "loss": 0.0088, + "step": 4032, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018688159977203506, + "lm_loss": 0.01080322265625, + "loss": 0.009, + "step": 4033, + "total_loss": 0.01080322265625 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018687523964098184, + "lm_loss": 0.00701904296875, + "loss": 0.0056, + "step": 4034, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001868688780768076, + "lm_loss": 0.01031494140625, + "loss": 0.0079, + "step": 4035, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018686251507961737, + "lm_loss": 0.007293701171875, + "loss": 0.0075, + "step": 4036, + "total_loss": 0.007293701171875 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018685615064951608, + "lm_loss": 0.01123046875, + "loss": 0.0072, + "step": 4037, + "total_loss": 0.01123046875 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018684978478660864, + "lm_loss": 0.005157470703125, + "loss": 0.0081, + "step": 4038, + "total_loss": 0.005157470703125 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018684341749100017, + "lm_loss": 0.005950927734375, + "loss": 0.0088, + "step": 4039, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018683704876279567, + "lm_loss": 0.01251220703125, + "loss": 0.0079, + "step": 4040, + "total_loss": 0.01251220703125 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018683067860210018, + "lm_loss": 0.005340576171875, + "loss": 0.0066, + "step": 4041, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018682430700901878, + "lm_loss": 0.00262451171875, + "loss": 0.0074, + "step": 4042, + "total_loss": 0.00262451171875 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018681793398365662, + "lm_loss": 0.006683349609375, + "loss": 0.006, + "step": 4043, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018681155952611882, + "lm_loss": 0.01202392578125, + "loss": 0.0076, + "step": 4044, + "total_loss": 0.01202392578125 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018680518363651052, + "lm_loss": 0.005584716796875, + "loss": 0.0088, + "step": 4045, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001867988063149369, + "lm_loss": 0.004180908203125, + "loss": 0.0055, + "step": 4046, + "total_loss": 0.004180908203125 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018679242756150313, + "lm_loss": 0.007476806640625, + "loss": 0.0062, + "step": 4047, + "total_loss": 0.007476806640625 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001867860473763145, + "lm_loss": 0.01068115234375, + "loss": 0.0092, + "step": 4048, + "total_loss": 0.01068115234375 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001867796657594762, + "lm_loss": 0.01123046875, + "loss": 0.0078, + "step": 4049, + "total_loss": 0.01123046875 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018677328271109357, + "lm_loss": 0.00830078125, + "loss": 0.0093, + "step": 4050, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018676689823127185, + "lm_loss": 0.0098876953125, + "loss": 0.0077, + "step": 4051, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001867605123201164, + "lm_loss": 0.0038909912109375, + "loss": 0.0074, + "step": 4052, + "total_loss": 0.0038909912109375 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018675412497773253, + "lm_loss": 0.005157470703125, + "loss": 0.0076, + "step": 4053, + "total_loss": 0.005157470703125 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018674773620422564, + "lm_loss": 0.0078125, + "loss": 0.0061, + "step": 4054, + "total_loss": 0.0078125 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001867413459997011, + "lm_loss": 0.009521484375, + "loss": 0.0092, + "step": 4055, + "total_loss": 0.009521484375 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001867349543642643, + "lm_loss": 0.0038909912109375, + "loss": 0.0074, + "step": 4056, + "total_loss": 0.0038909912109375 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001867285612980207, + "lm_loss": 0.01324462890625, + "loss": 0.0066, + "step": 4057, + "total_loss": 0.01324462890625 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018672216680107578, + "lm_loss": 0.00726318359375, + "loss": 0.0077, + "step": 4058, + "total_loss": 0.00726318359375 + }, + { + "epoch": 1.66, + "learning_rate": 0.000186715770873535, + "lm_loss": 0.0111083984375, + "loss": 0.0093, + "step": 4059, + "total_loss": 0.0111083984375 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018670937351550392, + "lm_loss": 0.00830078125, + "loss": 0.0075, + "step": 4060, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.66, + "learning_rate": 0.000186702974727088, + "lm_loss": 0.004669189453125, + "loss": 0.0075, + "step": 4061, + "total_loss": 0.004669189453125 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018669657450839287, + "lm_loss": 0.00494384765625, + "loss": 0.0069, + "step": 4062, + "total_loss": 0.00494384765625 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018669017285952405, + "lm_loss": 0.0084228515625, + "loss": 0.0065, + "step": 4063, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018668376978058716, + "lm_loss": 0.0018768310546875, + "loss": 0.0056, + "step": 4064, + "total_loss": 0.0018768310546875 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018667736527168784, + "lm_loss": 0.006103515625, + "loss": 0.006, + "step": 4065, + "total_loss": 0.006103515625 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018667095933293175, + "lm_loss": 0.005645751953125, + "loss": 0.0073, + "step": 4066, + "total_loss": 0.005645751953125 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001866645519644245, + "lm_loss": 0.0050048828125, + "loss": 0.0062, + "step": 4067, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018665814316627192, + "lm_loss": 0.00567626953125, + "loss": 0.0078, + "step": 4068, + "total_loss": 0.00567626953125 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018665173293857956, + "lm_loss": 0.0033416748046875, + "loss": 0.0075, + "step": 4069, + "total_loss": 0.0033416748046875 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018664532128145328, + "lm_loss": 0.004364013671875, + "loss": 0.0082, + "step": 4070, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018663890819499884, + "lm_loss": 0.01177978515625, + "loss": 0.0085, + "step": 4071, + "total_loss": 0.01177978515625 + }, + { + "epoch": 1.66, + "learning_rate": 0.000186632493679322, + "lm_loss": 0.0098876953125, + "loss": 0.0076, + "step": 4072, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001866260777345286, + "lm_loss": 0.00860595703125, + "loss": 0.0077, + "step": 4073, + "total_loss": 0.00860595703125 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018661966036072445, + "lm_loss": 0.012939453125, + "loss": 0.0066, + "step": 4074, + "total_loss": 0.012939453125 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018661324155801542, + "lm_loss": 0.0091552734375, + "loss": 0.0094, + "step": 4075, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018660682132650745, + "lm_loss": 0.0086669921875, + "loss": 0.0062, + "step": 4076, + "total_loss": 0.0086669921875 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018660039966630638, + "lm_loss": 0.007171630859375, + "loss": 0.0067, + "step": 4077, + "total_loss": 0.007171630859375 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001865939765775182, + "lm_loss": 0.005859375, + "loss": 0.0062, + "step": 4078, + "total_loss": 0.005859375 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001865875520602488, + "lm_loss": 0.006561279296875, + "loss": 0.0065, + "step": 4079, + "total_loss": 0.006561279296875 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001865811261146042, + "lm_loss": 0.006317138671875, + "loss": 0.0067, + "step": 4080, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018657469874069045, + "lm_loss": 0.0087890625, + "loss": 0.0077, + "step": 4081, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001865682699386135, + "lm_loss": 0.00921630859375, + "loss": 0.0083, + "step": 4082, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018656183970847944, + "lm_loss": 0.00396728515625, + "loss": 0.0077, + "step": 4083, + "total_loss": 0.00396728515625 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018655540805039434, + "lm_loss": 0.005462646484375, + "loss": 0.0059, + "step": 4084, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001865489749644643, + "lm_loss": 0.005706787109375, + "loss": 0.0062, + "step": 4085, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018654254045079544, + "lm_loss": 0.0025177001953125, + "loss": 0.0066, + "step": 4086, + "total_loss": 0.0025177001953125 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001865361045094939, + "lm_loss": 0.007568359375, + "loss": 0.0074, + "step": 4087, + "total_loss": 0.007568359375 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018652966714066585, + "lm_loss": 0.008056640625, + "loss": 0.0074, + "step": 4088, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.67, + "learning_rate": 0.0001865232283444175, + "lm_loss": 0.003997802734375, + "loss": 0.0063, + "step": 4089, + "total_loss": 0.003997802734375 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018651678812085505, + "lm_loss": 0.01153564453125, + "loss": 0.0074, + "step": 4090, + "total_loss": 0.01153564453125 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018651034647008475, + "lm_loss": 0.0157470703125, + "loss": 0.0073, + "step": 4091, + "total_loss": 0.0157470703125 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018650390339221284, + "lm_loss": 0.013671875, + "loss": 0.0088, + "step": 4092, + "total_loss": 0.013671875 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018649745888734564, + "lm_loss": 0.006256103515625, + "loss": 0.0078, + "step": 4093, + "total_loss": 0.006256103515625 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018649101295558943, + "lm_loss": 0.0030364990234375, + "loss": 0.0071, + "step": 4094, + "total_loss": 0.0030364990234375 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018648456559705058, + "lm_loss": 0.004425048828125, + "loss": 0.0072, + "step": 4095, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018647811681183542, + "lm_loss": 0.00897216796875, + "loss": 0.0094, + "step": 4096, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018647166660005034, + "lm_loss": 0.0089111328125, + "loss": 0.0074, + "step": 4097, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018646521496180172, + "lm_loss": 0.00787353515625, + "loss": 0.0065, + "step": 4098, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018645876189719602, + "lm_loss": 0.0030975341796875, + "loss": 0.0077, + "step": 4099, + "total_loss": 0.0030975341796875 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001864523074063397, + "lm_loss": 0.0107421875, + "loss": 0.0079, + "step": 4100, + "total_loss": 0.0107421875 + }, + { + "epoch": 1.68, + "eval_lm_loss": 0.009542959742248058, + "eval_loss": 0.00987449660897255, + "eval_runtime": 43.9699, + "eval_samples_per_second": 22.743, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009542959742248058, + "lm_loss": 0.00131988525390625, + "step": 4100, + "total_loss": 0.00131988525390625 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018644585148933922, + "lm_loss": 0.005523681640625, + "loss": 0.0067, + "step": 4101, + "total_loss": 0.005523681640625 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018643939414630104, + "lm_loss": 0.00970458984375, + "loss": 0.0075, + "step": 4102, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018643293537733176, + "lm_loss": 0.004058837890625, + "loss": 0.0065, + "step": 4103, + "total_loss": 0.004058837890625 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018642647518253788, + "lm_loss": 0.00518798828125, + "loss": 0.0077, + "step": 4104, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018642001356202595, + "lm_loss": 0.01092529296875, + "loss": 0.0083, + "step": 4105, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001864135505159026, + "lm_loss": 0.0091552734375, + "loss": 0.0077, + "step": 4106, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018640708604427444, + "lm_loss": 0.0018310546875, + "loss": 0.0061, + "step": 4107, + "total_loss": 0.0018310546875 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018640062014724812, + "lm_loss": 0.008056640625, + "loss": 0.0071, + "step": 4108, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018639415282493025, + "lm_loss": 0.006439208984375, + "loss": 0.0071, + "step": 4109, + "total_loss": 0.006439208984375 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018638768407742756, + "lm_loss": 0.007232666015625, + "loss": 0.0082, + "step": 4110, + "total_loss": 0.007232666015625 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018638121390484677, + "lm_loss": 0.0033416748046875, + "loss": 0.0071, + "step": 4111, + "total_loss": 0.0033416748046875 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018637474230729463, + "lm_loss": 0.00445556640625, + "loss": 0.0077, + "step": 4112, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001863682692848778, + "lm_loss": 0.002197265625, + "loss": 0.0074, + "step": 4113, + "total_loss": 0.002197265625 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018636179483770316, + "lm_loss": 0.01251220703125, + "loss": 0.0088, + "step": 4114, + "total_loss": 0.01251220703125 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018635531896587747, + "lm_loss": 0.00738525390625, + "loss": 0.0074, + "step": 4115, + "total_loss": 0.00738525390625 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018634884166950758, + "lm_loss": 0.006134033203125, + "loss": 0.0067, + "step": 4116, + "total_loss": 0.006134033203125 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018634236294870032, + "lm_loss": 0.0023193359375, + "loss": 0.0058, + "step": 4117, + "total_loss": 0.0023193359375 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001863358828035626, + "lm_loss": 0.0101318359375, + "loss": 0.0081, + "step": 4118, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018632940123420127, + "lm_loss": 0.003448486328125, + "loss": 0.0074, + "step": 4119, + "total_loss": 0.003448486328125 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001863229182407233, + "lm_loss": 0.006988525390625, + "loss": 0.0075, + "step": 4120, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.68, + "learning_rate": 0.0001863164338232356, + "lm_loss": 0.0057373046875, + "loss": 0.0067, + "step": 4121, + "total_loss": 0.0057373046875 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018630994798184513, + "lm_loss": 0.0089111328125, + "loss": 0.0067, + "step": 4122, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018630346071665894, + "lm_loss": 0.01104736328125, + "loss": 0.008, + "step": 4123, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018629697202778397, + "lm_loss": 0.01214599609375, + "loss": 0.0078, + "step": 4124, + "total_loss": 0.01214599609375 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018629048191532734, + "lm_loss": 0.004730224609375, + "loss": 0.0058, + "step": 4125, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018628399037939604, + "lm_loss": 0.0074462890625, + "loss": 0.007, + "step": 4126, + "total_loss": 0.0074462890625 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001862774974200972, + "lm_loss": 0.00506591796875, + "loss": 0.0075, + "step": 4127, + "total_loss": 0.00506591796875 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018627100303753792, + "lm_loss": 0.0037841796875, + "loss": 0.0064, + "step": 4128, + "total_loss": 0.0037841796875 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018626450723182534, + "lm_loss": 0.00634765625, + "loss": 0.0074, + "step": 4129, + "total_loss": 0.00634765625 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001862580100030666, + "lm_loss": 0.0037078857421875, + "loss": 0.0073, + "step": 4130, + "total_loss": 0.0037078857421875 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001862515113513689, + "lm_loss": 0.0101318359375, + "loss": 0.0087, + "step": 4131, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001862450112768394, + "lm_loss": 0.017578125, + "loss": 0.0084, + "step": 4132, + "total_loss": 0.017578125 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018623850977958538, + "lm_loss": 0.00921630859375, + "loss": 0.0053, + "step": 4133, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018623200685971404, + "lm_loss": 0.0150146484375, + "loss": 0.0081, + "step": 4134, + "total_loss": 0.0150146484375 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018622550251733272, + "lm_loss": 0.0130615234375, + "loss": 0.007, + "step": 4135, + "total_loss": 0.0130615234375 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018621899675254867, + "lm_loss": 0.00848388671875, + "loss": 0.006, + "step": 4136, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018621248956546924, + "lm_loss": 0.0108642578125, + "loss": 0.0074, + "step": 4137, + "total_loss": 0.0108642578125 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018620598095620176, + "lm_loss": 0.006988525390625, + "loss": 0.009, + "step": 4138, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018619947092485357, + "lm_loss": 0.00579833984375, + "loss": 0.0078, + "step": 4139, + "total_loss": 0.00579833984375 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001861929594715321, + "lm_loss": 0.007568359375, + "loss": 0.0064, + "step": 4140, + "total_loss": 0.007568359375 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018618644659634475, + "lm_loss": 0.00872802734375, + "loss": 0.0078, + "step": 4141, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.69, + "learning_rate": 0.000186179932299399, + "lm_loss": 0.00506591796875, + "loss": 0.0077, + "step": 4142, + "total_loss": 0.00506591796875 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001861734165808022, + "lm_loss": 0.0162353515625, + "loss": 0.0081, + "step": 4143, + "total_loss": 0.0162353515625 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018616689944066196, + "lm_loss": 0.006927490234375, + "loss": 0.0087, + "step": 4144, + "total_loss": 0.006927490234375 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018616038087908572, + "lm_loss": 0.00982666015625, + "loss": 0.0074, + "step": 4145, + "total_loss": 0.00982666015625 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018615386089618102, + "lm_loss": 0.006317138671875, + "loss": 0.0066, + "step": 4146, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018614733949205543, + "lm_loss": 0.0035858154296875, + "loss": 0.0074, + "step": 4147, + "total_loss": 0.0035858154296875 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001861408166668165, + "lm_loss": 0.006683349609375, + "loss": 0.0084, + "step": 4148, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001861342924205719, + "lm_loss": 0.0135498046875, + "loss": 0.0077, + "step": 4149, + "total_loss": 0.0135498046875 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001861277667534292, + "lm_loss": 0.0146484375, + "loss": 0.0077, + "step": 4150, + "total_loss": 0.0146484375 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018612123966549606, + "lm_loss": 0.00665283203125, + "loss": 0.0077, + "step": 4151, + "total_loss": 0.00665283203125 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018611471115688013, + "lm_loss": 0.0067138671875, + "loss": 0.0078, + "step": 4152, + "total_loss": 0.0067138671875 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018610818122768915, + "lm_loss": 0.0177001953125, + "loss": 0.0081, + "step": 4153, + "total_loss": 0.0177001953125 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018610164987803085, + "lm_loss": 0.00714111328125, + "loss": 0.0071, + "step": 4154, + "total_loss": 0.00714111328125 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018609511710801288, + "lm_loss": 0.01470947265625, + "loss": 0.0081, + "step": 4155, + "total_loss": 0.01470947265625 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018608858291774313, + "lm_loss": 0.01385498046875, + "loss": 0.0098, + "step": 4156, + "total_loss": 0.01385498046875 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001860820473073293, + "lm_loss": 0.0081787109375, + "loss": 0.0072, + "step": 4157, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018607551027687925, + "lm_loss": 0.01446533203125, + "loss": 0.0087, + "step": 4158, + "total_loss": 0.01446533203125 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001860689718265008, + "lm_loss": 0.003387451171875, + "loss": 0.0068, + "step": 4159, + "total_loss": 0.003387451171875 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001860624319563018, + "lm_loss": 0.004364013671875, + "loss": 0.0066, + "step": 4160, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018605589066639013, + "lm_loss": 0.00384521484375, + "loss": 0.0078, + "step": 4161, + "total_loss": 0.00384521484375 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018604934795687372, + "lm_loss": 0.00787353515625, + "loss": 0.0077, + "step": 4162, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001860428038278605, + "lm_loss": 0.01153564453125, + "loss": 0.008, + "step": 4163, + "total_loss": 0.01153564453125 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018603625827945844, + "lm_loss": 0.0089111328125, + "loss": 0.0072, + "step": 4164, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018602971131177548, + "lm_loss": 0.01092529296875, + "loss": 0.0094, + "step": 4165, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018602316292491963, + "lm_loss": 0.0147705078125, + "loss": 0.0084, + "step": 4166, + "total_loss": 0.0147705078125 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001860166131189989, + "lm_loss": 0.003570556640625, + "loss": 0.0072, + "step": 4167, + "total_loss": 0.003570556640625 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018601006189412136, + "lm_loss": 0.00787353515625, + "loss": 0.0071, + "step": 4168, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001860035092503951, + "lm_loss": 0.01080322265625, + "loss": 0.0073, + "step": 4169, + "total_loss": 0.01080322265625 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001859969551879282, + "lm_loss": 0.0091552734375, + "loss": 0.0083, + "step": 4170, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018599039970682872, + "lm_loss": 0.00830078125, + "loss": 0.0078, + "step": 4171, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018598384280720488, + "lm_loss": 0.00775146484375, + "loss": 0.007, + "step": 4172, + "total_loss": 0.00775146484375 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018597728448916482, + "lm_loss": 0.007568359375, + "loss": 0.0071, + "step": 4173, + "total_loss": 0.007568359375 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018597072475281672, + "lm_loss": 0.006744384765625, + "loss": 0.0071, + "step": 4174, + "total_loss": 0.006744384765625 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001859641635982688, + "lm_loss": 0.00537109375, + "loss": 0.0063, + "step": 4175, + "total_loss": 0.00537109375 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001859576010256293, + "lm_loss": 0.00946044921875, + "loss": 0.0095, + "step": 4176, + "total_loss": 0.00946044921875 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018595103703500645, + "lm_loss": 0.005035400390625, + "loss": 0.007, + "step": 4177, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018594447162650855, + "lm_loss": 0.003204345703125, + "loss": 0.0095, + "step": 4178, + "total_loss": 0.003204345703125 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001859379048002439, + "lm_loss": 0.0054931640625, + "loss": 0.0073, + "step": 4179, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018593133655632086, + "lm_loss": 0.00604248046875, + "loss": 0.0066, + "step": 4180, + "total_loss": 0.00604248046875 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018592476689484775, + "lm_loss": 0.004241943359375, + "loss": 0.0072, + "step": 4181, + "total_loss": 0.004241943359375 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018591819581593292, + "lm_loss": 0.0054931640625, + "loss": 0.0062, + "step": 4182, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018591162331968482, + "lm_loss": 0.006988525390625, + "loss": 0.0085, + "step": 4183, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018590504940621189, + "lm_loss": 0.00531005859375, + "loss": 0.0067, + "step": 4184, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018589847407562247, + "lm_loss": 0.007232666015625, + "loss": 0.0067, + "step": 4185, + "total_loss": 0.007232666015625 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018589189732802513, + "lm_loss": 0.00836181640625, + "loss": 0.0074, + "step": 4186, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018588531916352832, + "lm_loss": 0.0123291015625, + "loss": 0.0081, + "step": 4187, + "total_loss": 0.0123291015625 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018587873958224055, + "lm_loss": 0.0130615234375, + "loss": 0.0072, + "step": 4188, + "total_loss": 0.0130615234375 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018587215858427037, + "lm_loss": 0.00653076171875, + "loss": 0.0077, + "step": 4189, + "total_loss": 0.00653076171875 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001858655761697264, + "lm_loss": 0.00640869140625, + "loss": 0.0075, + "step": 4190, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018585899233871712, + "lm_loss": 0.0167236328125, + "loss": 0.0081, + "step": 4191, + "total_loss": 0.0167236328125 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018585240709135116, + "lm_loss": 0.01080322265625, + "loss": 0.0073, + "step": 4192, + "total_loss": 0.01080322265625 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018584582042773722, + "lm_loss": 0.00518798828125, + "loss": 0.0092, + "step": 4193, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001858392323479839, + "lm_loss": 0.0030670166015625, + "loss": 0.0081, + "step": 4194, + "total_loss": 0.0030670166015625 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018583264285219992, + "lm_loss": 0.01416015625, + "loss": 0.0074, + "step": 4195, + "total_loss": 0.01416015625 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018582605194049394, + "lm_loss": 0.0137939453125, + "loss": 0.0076, + "step": 4196, + "total_loss": 0.0137939453125 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001858194596129747, + "lm_loss": 0.006927490234375, + "loss": 0.0071, + "step": 4197, + "total_loss": 0.006927490234375 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018581286586975094, + "lm_loss": 0.00885009765625, + "loss": 0.0087, + "step": 4198, + "total_loss": 0.00885009765625 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018580627071093146, + "lm_loss": 0.01220703125, + "loss": 0.0076, + "step": 4199, + "total_loss": 0.01220703125 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018579967413662508, + "lm_loss": 0.007659912109375, + "loss": 0.0071, + "step": 4200, + "total_loss": 0.007659912109375 + }, + { + "epoch": 1.72, + "eval_lm_loss": 0.009608560241758823, + "eval_loss": 0.009925203397870064, + "eval_runtime": 44.1406, + "eval_samples_per_second": 22.655, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.009608560241758823, + "lm_loss": 0.0020294189453125, + "step": 4200, + "total_loss": 0.0020294189453125 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001857930761469405, + "lm_loss": 0.0111083984375, + "loss": 0.0096, + "step": 4201, + "total_loss": 0.0111083984375 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001857864767419867, + "lm_loss": 0.00408935546875, + "loss": 0.0063, + "step": 4202, + "total_loss": 0.00408935546875 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018577987592187248, + "lm_loss": 0.006439208984375, + "loss": 0.0085, + "step": 4203, + "total_loss": 0.006439208984375 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018577327368670673, + "lm_loss": 0.00677490234375, + "loss": 0.0063, + "step": 4204, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018576667003659836, + "lm_loss": 0.006500244140625, + "loss": 0.0088, + "step": 4205, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018576006497165632, + "lm_loss": 0.004730224609375, + "loss": 0.0082, + "step": 4206, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018575345849198958, + "lm_loss": 0.006927490234375, + "loss": 0.0057, + "step": 4207, + "total_loss": 0.006927490234375 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018574685059770708, + "lm_loss": 0.00579833984375, + "loss": 0.0066, + "step": 4208, + "total_loss": 0.00579833984375 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018574024128891788, + "lm_loss": 0.0047607421875, + "loss": 0.006, + "step": 4209, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018573363056573097, + "lm_loss": 0.006683349609375, + "loss": 0.0067, + "step": 4210, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018572701842825543, + "lm_loss": 0.0016937255859375, + "loss": 0.0068, + "step": 4211, + "total_loss": 0.0016937255859375 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018572040487660027, + "lm_loss": 0.0047607421875, + "loss": 0.0069, + "step": 4212, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001857137899108747, + "lm_loss": 0.005462646484375, + "loss": 0.0048, + "step": 4213, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018570717353118775, + "lm_loss": 0.0159912109375, + "loss": 0.0078, + "step": 4214, + "total_loss": 0.0159912109375 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001857005557376486, + "lm_loss": 0.0048828125, + "loss": 0.0064, + "step": 4215, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018569393653036645, + "lm_loss": 0.0133056640625, + "loss": 0.0067, + "step": 4216, + "total_loss": 0.0133056640625 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001856873159094504, + "lm_loss": 0.0045166015625, + "loss": 0.0053, + "step": 4217, + "total_loss": 0.0045166015625 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018568069387500977, + "lm_loss": 0.006317138671875, + "loss": 0.0075, + "step": 4218, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018567407042715376, + "lm_loss": 0.0093994140625, + "loss": 0.0097, + "step": 4219, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001856674455659916, + "lm_loss": 0.00701904296875, + "loss": 0.0077, + "step": 4220, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018566081929163262, + "lm_loss": 0.006378173828125, + "loss": 0.0091, + "step": 4221, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018565419160418613, + "lm_loss": 0.004486083984375, + "loss": 0.0067, + "step": 4222, + "total_loss": 0.004486083984375 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001856475625037614, + "lm_loss": 0.006500244140625, + "loss": 0.0057, + "step": 4223, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018564093199046786, + "lm_loss": 0.01226806640625, + "loss": 0.0061, + "step": 4224, + "total_loss": 0.01226806640625 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018563430006441484, + "lm_loss": 0.00518798828125, + "loss": 0.009, + "step": 4225, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018562766672571178, + "lm_loss": 0.005889892578125, + "loss": 0.0096, + "step": 4226, + "total_loss": 0.005889892578125 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018562103197446807, + "lm_loss": 0.00421142578125, + "loss": 0.0082, + "step": 4227, + "total_loss": 0.00421142578125 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001856143958107932, + "lm_loss": 0.0036468505859375, + "loss": 0.0054, + "step": 4228, + "total_loss": 0.0036468505859375 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001856077582347966, + "lm_loss": 0.0059814453125, + "loss": 0.0084, + "step": 4229, + "total_loss": 0.0059814453125 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018560111924658778, + "lm_loss": 0.007781982421875, + "loss": 0.0066, + "step": 4230, + "total_loss": 0.007781982421875 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018559447884627627, + "lm_loss": 0.00872802734375, + "loss": 0.0088, + "step": 4231, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001855878370339716, + "lm_loss": 0.002166748046875, + "loss": 0.0048, + "step": 4232, + "total_loss": 0.002166748046875 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018558119380978336, + "lm_loss": 0.0033416748046875, + "loss": 0.0066, + "step": 4233, + "total_loss": 0.0033416748046875 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018557454917382112, + "lm_loss": 0.005645751953125, + "loss": 0.0063, + "step": 4234, + "total_loss": 0.005645751953125 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018556790312619444, + "lm_loss": 0.007598876953125, + "loss": 0.0097, + "step": 4235, + "total_loss": 0.007598876953125 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018556125566701308, + "lm_loss": 0.007720947265625, + "loss": 0.0071, + "step": 4236, + "total_loss": 0.007720947265625 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018555460679638657, + "lm_loss": 0.00738525390625, + "loss": 0.0085, + "step": 4237, + "total_loss": 0.00738525390625 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018554795651442468, + "lm_loss": 0.00811767578125, + "loss": 0.007, + "step": 4238, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018554130482123705, + "lm_loss": 0.004425048828125, + "loss": 0.0073, + "step": 4239, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018553465171693346, + "lm_loss": 0.006561279296875, + "loss": 0.0077, + "step": 4240, + "total_loss": 0.006561279296875 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018552799720162363, + "lm_loss": 0.0123291015625, + "loss": 0.0063, + "step": 4241, + "total_loss": 0.0123291015625 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018552134127541736, + "lm_loss": 0.0057373046875, + "loss": 0.0067, + "step": 4242, + "total_loss": 0.0057373046875 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018551468393842444, + "lm_loss": 0.01251220703125, + "loss": 0.0083, + "step": 4243, + "total_loss": 0.01251220703125 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018550802519075467, + "lm_loss": 0.0103759765625, + "loss": 0.0073, + "step": 4244, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001855013650325179, + "lm_loss": 0.0074462890625, + "loss": 0.0066, + "step": 4245, + "total_loss": 0.0074462890625 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018549470346382405, + "lm_loss": 0.00390625, + "loss": 0.0065, + "step": 4246, + "total_loss": 0.00390625 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018548804048478294, + "lm_loss": 0.005126953125, + "loss": 0.0073, + "step": 4247, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018548137609550452, + "lm_loss": 0.0035400390625, + "loss": 0.0081, + "step": 4248, + "total_loss": 0.0035400390625 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018547471029609875, + "lm_loss": 0.013671875, + "loss": 0.0077, + "step": 4249, + "total_loss": 0.013671875 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018546804308667552, + "lm_loss": 0.00640869140625, + "loss": 0.0074, + "step": 4250, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018546137446734487, + "lm_loss": 0.0054931640625, + "loss": 0.0069, + "step": 4251, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018545470443821681, + "lm_loss": 0.00787353515625, + "loss": 0.0064, + "step": 4252, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018544803299940137, + "lm_loss": 0.00396728515625, + "loss": 0.0081, + "step": 4253, + "total_loss": 0.00396728515625 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018544136015100857, + "lm_loss": 0.0042724609375, + "loss": 0.009, + "step": 4254, + "total_loss": 0.0042724609375 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001854346858931485, + "lm_loss": 0.015625, + "loss": 0.0078, + "step": 4255, + "total_loss": 0.015625 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018542801022593128, + "lm_loss": 0.006591796875, + "loss": 0.0059, + "step": 4256, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018542133314946704, + "lm_loss": 0.0164794921875, + "loss": 0.0103, + "step": 4257, + "total_loss": 0.0164794921875 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001854146546638659, + "lm_loss": 0.00152587890625, + "loss": 0.0065, + "step": 4258, + "total_loss": 0.00152587890625 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018540797476923802, + "lm_loss": 0.004425048828125, + "loss": 0.0076, + "step": 4259, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018540129346569364, + "lm_loss": 0.0034332275390625, + "loss": 0.0065, + "step": 4260, + "total_loss": 0.0034332275390625 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001853946107533429, + "lm_loss": 0.007354736328125, + "loss": 0.0074, + "step": 4261, + "total_loss": 0.007354736328125 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018538792663229617, + "lm_loss": 0.006988525390625, + "loss": 0.0077, + "step": 4262, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018538124110266358, + "lm_loss": 0.00555419921875, + "loss": 0.006, + "step": 4263, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001853745541645555, + "lm_loss": 0.01373291015625, + "loss": 0.009, + "step": 4264, + "total_loss": 0.01373291015625 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001853678658180822, + "lm_loss": 0.014892578125, + "loss": 0.0092, + "step": 4265, + "total_loss": 0.014892578125 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018536117606335402, + "lm_loss": 0.0079345703125, + "loss": 0.0063, + "step": 4266, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001853544849004813, + "lm_loss": 0.01300048828125, + "loss": 0.008, + "step": 4267, + "total_loss": 0.01300048828125 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018534779232957447, + "lm_loss": 0.0133056640625, + "loss": 0.0071, + "step": 4268, + "total_loss": 0.0133056640625 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001853410983507439, + "lm_loss": 0.002044677734375, + "loss": 0.0083, + "step": 4269, + "total_loss": 0.002044677734375 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018533440296410002, + "lm_loss": 0.01446533203125, + "loss": 0.0088, + "step": 4270, + "total_loss": 0.01446533203125 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018532770616975326, + "lm_loss": 0.007049560546875, + "loss": 0.0077, + "step": 4271, + "total_loss": 0.007049560546875 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018532100796781414, + "lm_loss": 0.00421142578125, + "loss": 0.0076, + "step": 4272, + "total_loss": 0.00421142578125 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018531430835839308, + "lm_loss": 0.012939453125, + "loss": 0.0085, + "step": 4273, + "total_loss": 0.012939453125 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018530760734160066, + "lm_loss": 0.004425048828125, + "loss": 0.0068, + "step": 4274, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018530090491754742, + "lm_loss": 0.005645751953125, + "loss": 0.0064, + "step": 4275, + "total_loss": 0.005645751953125 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001852942010863439, + "lm_loss": 0.018798828125, + "loss": 0.0062, + "step": 4276, + "total_loss": 0.018798828125 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001852874958481007, + "lm_loss": 0.0078125, + "loss": 0.0079, + "step": 4277, + "total_loss": 0.0078125 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018528078920292847, + "lm_loss": 0.0164794921875, + "loss": 0.0081, + "step": 4278, + "total_loss": 0.0164794921875 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018527408115093775, + "lm_loss": 0.005340576171875, + "loss": 0.0082, + "step": 4279, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001852673716922393, + "lm_loss": 0.0078125, + "loss": 0.0078, + "step": 4280, + "total_loss": 0.0078125 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018526066082694373, + "lm_loss": 0.00543212890625, + "loss": 0.0069, + "step": 4281, + "total_loss": 0.00543212890625 + }, + { + "epoch": 1.75, + "learning_rate": 0.0001852539485551618, + "lm_loss": 0.006072998046875, + "loss": 0.0081, + "step": 4282, + "total_loss": 0.006072998046875 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018524723487700414, + "lm_loss": 0.0106201171875, + "loss": 0.0086, + "step": 4283, + "total_loss": 0.0106201171875 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018524051979258163, + "lm_loss": 0.01171875, + "loss": 0.0069, + "step": 4284, + "total_loss": 0.01171875 + }, + { + "epoch": 1.75, + "learning_rate": 0.000185233803302005, + "lm_loss": 0.005218505859375, + "loss": 0.0077, + "step": 4285, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.75, + "learning_rate": 0.000185227085405385, + "lm_loss": 0.0091552734375, + "loss": 0.0072, + "step": 4286, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018522036610283248, + "lm_loss": 0.004150390625, + "loss": 0.0067, + "step": 4287, + "total_loss": 0.004150390625 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018521364539445827, + "lm_loss": 0.004608154296875, + "loss": 0.0066, + "step": 4288, + "total_loss": 0.004608154296875 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018520692328037326, + "lm_loss": 0.0047607421875, + "loss": 0.007, + "step": 4289, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018520019976068835, + "lm_loss": 0.00482177734375, + "loss": 0.0085, + "step": 4290, + "total_loss": 0.00482177734375 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018519347483551441, + "lm_loss": 0.0029296875, + "loss": 0.0068, + "step": 4291, + "total_loss": 0.0029296875 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018518674850496244, + "lm_loss": 0.0081787109375, + "loss": 0.0064, + "step": 4292, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018518002076914334, + "lm_loss": 0.005126953125, + "loss": 0.0071, + "step": 4293, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001851732916281681, + "lm_loss": 0.0030059814453125, + "loss": 0.0043, + "step": 4294, + "total_loss": 0.0030059814453125 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018516656108214777, + "lm_loss": 0.00360107421875, + "loss": 0.0066, + "step": 4295, + "total_loss": 0.00360107421875 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018515982913119334, + "lm_loss": 0.00927734375, + "loss": 0.0079, + "step": 4296, + "total_loss": 0.00927734375 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018515309577541586, + "lm_loss": 0.00445556640625, + "loss": 0.0066, + "step": 4297, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018514636101492641, + "lm_loss": 0.0062255859375, + "loss": 0.0086, + "step": 4298, + "total_loss": 0.0062255859375 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018513962484983614, + "lm_loss": 0.0118408203125, + "loss": 0.0083, + "step": 4299, + "total_loss": 0.0118408203125 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018513288728025607, + "lm_loss": 0.0108642578125, + "loss": 0.0095, + "step": 4300, + "total_loss": 0.0108642578125 + }, + { + "epoch": 1.76, + "eval_lm_loss": 0.009254022501409054, + "eval_loss": 0.009628057479858398, + "eval_runtime": 44.0235, + "eval_samples_per_second": 22.715, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.009254022501409054, + "lm_loss": 0.000827789306640625, + "step": 4300, + "total_loss": 0.000827789306640625 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018512614830629745, + "lm_loss": 0.007415771484375, + "loss": 0.0065, + "step": 4301, + "total_loss": 0.007415771484375 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018511940792807137, + "lm_loss": 0.00836181640625, + "loss": 0.0067, + "step": 4302, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018511266614568904, + "lm_loss": 0.007110595703125, + "loss": 0.0078, + "step": 4303, + "total_loss": 0.007110595703125 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018510592295926172, + "lm_loss": 0.0042724609375, + "loss": 0.0071, + "step": 4304, + "total_loss": 0.0042724609375 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018509917836890063, + "lm_loss": 0.00787353515625, + "loss": 0.0069, + "step": 4305, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018509243237471696, + "lm_loss": 0.00408935546875, + "loss": 0.0069, + "step": 4306, + "total_loss": 0.00408935546875 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001850856849768221, + "lm_loss": 0.008544921875, + "loss": 0.0071, + "step": 4307, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001850789361753273, + "lm_loss": 0.005401611328125, + "loss": 0.0072, + "step": 4308, + "total_loss": 0.005401611328125 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018507218597034388, + "lm_loss": 0.0078125, + "loss": 0.0054, + "step": 4309, + "total_loss": 0.0078125 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001850654343619832, + "lm_loss": 0.005767822265625, + "loss": 0.0087, + "step": 4310, + "total_loss": 0.005767822265625 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018505868135035666, + "lm_loss": 0.01348876953125, + "loss": 0.0066, + "step": 4311, + "total_loss": 0.01348876953125 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018505192693557567, + "lm_loss": 0.00640869140625, + "loss": 0.0073, + "step": 4312, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018504517111775162, + "lm_loss": 0.004730224609375, + "loss": 0.0068, + "step": 4313, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001850384138969959, + "lm_loss": 0.0038299560546875, + "loss": 0.0065, + "step": 4314, + "total_loss": 0.0038299560546875 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018503165527342016, + "lm_loss": 0.01263427734375, + "loss": 0.0084, + "step": 4315, + "total_loss": 0.01263427734375 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001850248952471357, + "lm_loss": 0.0036163330078125, + "loss": 0.0084, + "step": 4316, + "total_loss": 0.0036163330078125 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018501813381825412, + "lm_loss": 0.00531005859375, + "loss": 0.0077, + "step": 4317, + "total_loss": 0.00531005859375 + }, + { + "epoch": 1.77, + "learning_rate": 0.000185011370986887, + "lm_loss": 0.005462646484375, + "loss": 0.0074, + "step": 4318, + "total_loss": 0.005462646484375 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018500460675314578, + "lm_loss": 0.0091552734375, + "loss": 0.0079, + "step": 4319, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018499784111714217, + "lm_loss": 0.007537841796875, + "loss": 0.0074, + "step": 4320, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001849910740789877, + "lm_loss": 0.0047607421875, + "loss": 0.0088, + "step": 4321, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018498430563879405, + "lm_loss": 0.011962890625, + "loss": 0.0078, + "step": 4322, + "total_loss": 0.011962890625 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018497753579667286, + "lm_loss": 0.00213623046875, + "loss": 0.0072, + "step": 4323, + "total_loss": 0.00213623046875 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018497076455273578, + "lm_loss": 0.0093994140625, + "loss": 0.007, + "step": 4324, + "total_loss": 0.0093994140625 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018496399190709453, + "lm_loss": 0.007659912109375, + "loss": 0.0072, + "step": 4325, + "total_loss": 0.007659912109375 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018495721785986082, + "lm_loss": 0.0035400390625, + "loss": 0.0064, + "step": 4326, + "total_loss": 0.0035400390625 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018495044241114642, + "lm_loss": 0.005035400390625, + "loss": 0.0069, + "step": 4327, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001849436655610631, + "lm_loss": 0.0137939453125, + "loss": 0.0083, + "step": 4328, + "total_loss": 0.0137939453125 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018493688730972262, + "lm_loss": 0.004119873046875, + "loss": 0.0067, + "step": 4329, + "total_loss": 0.004119873046875 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018493010765723686, + "lm_loss": 0.005645751953125, + "loss": 0.0087, + "step": 4330, + "total_loss": 0.005645751953125 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018492332660371757, + "lm_loss": 0.0062255859375, + "loss": 0.007, + "step": 4331, + "total_loss": 0.0062255859375 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001849165441492767, + "lm_loss": 0.008544921875, + "loss": 0.0068, + "step": 4332, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001849097602940261, + "lm_loss": 0.005340576171875, + "loss": 0.0068, + "step": 4333, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018490297503807765, + "lm_loss": 0.007293701171875, + "loss": 0.0053, + "step": 4334, + "total_loss": 0.007293701171875 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018489618838154332, + "lm_loss": 0.0108642578125, + "loss": 0.0069, + "step": 4335, + "total_loss": 0.0108642578125 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018488940032453506, + "lm_loss": 0.010986328125, + "loss": 0.0089, + "step": 4336, + "total_loss": 0.010986328125 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001848826108671648, + "lm_loss": 0.003387451171875, + "loss": 0.0068, + "step": 4337, + "total_loss": 0.003387451171875 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018487582000954462, + "lm_loss": 0.01300048828125, + "loss": 0.0083, + "step": 4338, + "total_loss": 0.01300048828125 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001848690277517865, + "lm_loss": 0.007171630859375, + "loss": 0.0078, + "step": 4339, + "total_loss": 0.007171630859375 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001848622340940025, + "lm_loss": 0.005706787109375, + "loss": 0.0064, + "step": 4340, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018485543903630464, + "lm_loss": 0.01165771484375, + "loss": 0.0085, + "step": 4341, + "total_loss": 0.01165771484375 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001848486425788051, + "lm_loss": 0.00872802734375, + "loss": 0.0089, + "step": 4342, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018484184472161594, + "lm_loss": 0.0103759765625, + "loss": 0.0075, + "step": 4343, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001848350454648493, + "lm_loss": 0.0021209716796875, + "loss": 0.0054, + "step": 4344, + "total_loss": 0.0021209716796875 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001848282448086174, + "lm_loss": 0.00628662109375, + "loss": 0.0074, + "step": 4345, + "total_loss": 0.00628662109375 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018482144275303231, + "lm_loss": 0.0087890625, + "loss": 0.0056, + "step": 4346, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018481463929820633, + "lm_loss": 0.006011962890625, + "loss": 0.0073, + "step": 4347, + "total_loss": 0.006011962890625 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001848078344442517, + "lm_loss": 0.005615234375, + "loss": 0.0064, + "step": 4348, + "total_loss": 0.005615234375 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001848010281912806, + "lm_loss": 0.00177001953125, + "loss": 0.006, + "step": 4349, + "total_loss": 0.00177001953125 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001847942205394054, + "lm_loss": 0.004425048828125, + "loss": 0.0074, + "step": 4350, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018478741148873832, + "lm_loss": 0.00836181640625, + "loss": 0.0081, + "step": 4351, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001847806010393917, + "lm_loss": 0.00457763671875, + "loss": 0.0074, + "step": 4352, + "total_loss": 0.00457763671875 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018477378919147798, + "lm_loss": 0.009033203125, + "loss": 0.0085, + "step": 4353, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018476697594510942, + "lm_loss": 0.007415771484375, + "loss": 0.0071, + "step": 4354, + "total_loss": 0.007415771484375 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018476016130039843, + "lm_loss": 0.004180908203125, + "loss": 0.0051, + "step": 4355, + "total_loss": 0.004180908203125 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018475334525745745, + "lm_loss": 0.013671875, + "loss": 0.0087, + "step": 4356, + "total_loss": 0.013671875 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018474652781639894, + "lm_loss": 0.0096435546875, + "loss": 0.0073, + "step": 4357, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018473970897733532, + "lm_loss": 0.01214599609375, + "loss": 0.0085, + "step": 4358, + "total_loss": 0.01214599609375 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018473288874037912, + "lm_loss": 0.0130615234375, + "loss": 0.0073, + "step": 4359, + "total_loss": 0.0130615234375 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001847260671056428, + "lm_loss": 0.004486083984375, + "loss": 0.0084, + "step": 4360, + "total_loss": 0.004486083984375 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018471924407323893, + "lm_loss": 0.005584716796875, + "loss": 0.0094, + "step": 4361, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018471241964328002, + "lm_loss": 0.007354736328125, + "loss": 0.0072, + "step": 4362, + "total_loss": 0.007354736328125 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018470559381587873, + "lm_loss": 0.01318359375, + "loss": 0.0084, + "step": 4363, + "total_loss": 0.01318359375 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001846987665911476, + "lm_loss": 0.007659912109375, + "loss": 0.007, + "step": 4364, + "total_loss": 0.007659912109375 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001846919379691992, + "lm_loss": 0.00933837890625, + "loss": 0.0077, + "step": 4365, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001846851079501463, + "lm_loss": 0.0042724609375, + "loss": 0.007, + "step": 4366, + "total_loss": 0.0042724609375 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018467827653410147, + "lm_loss": 0.002838134765625, + "loss": 0.0073, + "step": 4367, + "total_loss": 0.002838134765625 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018467144372117747, + "lm_loss": 0.00885009765625, + "loss": 0.0072, + "step": 4368, + "total_loss": 0.00885009765625 + }, + { + "epoch": 1.79, + "learning_rate": 0.000184664609511487, + "lm_loss": 0.00848388671875, + "loss": 0.0066, + "step": 4369, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018465777390514276, + "lm_loss": 0.00390625, + "loss": 0.0081, + "step": 4370, + "total_loss": 0.00390625 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018465093690225756, + "lm_loss": 0.00457763671875, + "loss": 0.0089, + "step": 4371, + "total_loss": 0.00457763671875 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018464409850294414, + "lm_loss": 0.0081787109375, + "loss": 0.007, + "step": 4372, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001846372587073154, + "lm_loss": 0.00701904296875, + "loss": 0.0065, + "step": 4373, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018463041751548404, + "lm_loss": 0.00439453125, + "loss": 0.0073, + "step": 4374, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018462357492756302, + "lm_loss": 0.005584716796875, + "loss": 0.0064, + "step": 4375, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018461673094366513, + "lm_loss": 0.0174560546875, + "loss": 0.0075, + "step": 4376, + "total_loss": 0.0174560546875 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018460988556390334, + "lm_loss": 0.0191650390625, + "loss": 0.0097, + "step": 4377, + "total_loss": 0.0191650390625 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018460303878839058, + "lm_loss": 0.003692626953125, + "loss": 0.0086, + "step": 4378, + "total_loss": 0.003692626953125 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018459619061723977, + "lm_loss": 0.0038604736328125, + "loss": 0.0069, + "step": 4379, + "total_loss": 0.0038604736328125 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018458934105056383, + "lm_loss": 0.00689697265625, + "loss": 0.007, + "step": 4380, + "total_loss": 0.00689697265625 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018458249008847583, + "lm_loss": 0.01165771484375, + "loss": 0.0089, + "step": 4381, + "total_loss": 0.01165771484375 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018457563773108878, + "lm_loss": 0.00421142578125, + "loss": 0.0068, + "step": 4382, + "total_loss": 0.00421142578125 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018456878397851565, + "lm_loss": 0.01092529296875, + "loss": 0.0073, + "step": 4383, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018456192883086958, + "lm_loss": 0.01263427734375, + "loss": 0.0098, + "step": 4384, + "total_loss": 0.01263427734375 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018455507228826358, + "lm_loss": 0.00299072265625, + "loss": 0.0058, + "step": 4385, + "total_loss": 0.00299072265625 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018454821435081082, + "lm_loss": 0.00439453125, + "loss": 0.0079, + "step": 4386, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018454135501862445, + "lm_loss": 0.01708984375, + "loss": 0.0079, + "step": 4387, + "total_loss": 0.01708984375 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018453449429181752, + "lm_loss": 0.00628662109375, + "loss": 0.0074, + "step": 4388, + "total_loss": 0.00628662109375 + }, + { + "epoch": 1.79, + "learning_rate": 0.0001845276321705033, + "lm_loss": 0.013916015625, + "loss": 0.0082, + "step": 4389, + "total_loss": 0.013916015625 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018452076865479495, + "lm_loss": 0.00390625, + "loss": 0.0068, + "step": 4390, + "total_loss": 0.00390625 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018451390374480572, + "lm_loss": 0.0052490234375, + "loss": 0.0075, + "step": 4391, + "total_loss": 0.0052490234375 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018450703744064882, + "lm_loss": 0.004791259765625, + "loss": 0.0071, + "step": 4392, + "total_loss": 0.004791259765625 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018450016974243757, + "lm_loss": 0.010498046875, + "loss": 0.0069, + "step": 4393, + "total_loss": 0.010498046875 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001844933006502852, + "lm_loss": 0.005279541015625, + "loss": 0.0079, + "step": 4394, + "total_loss": 0.005279541015625 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018448643016430504, + "lm_loss": 0.005340576171875, + "loss": 0.0069, + "step": 4395, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018447955828461047, + "lm_loss": 0.0068359375, + "loss": 0.0066, + "step": 4396, + "total_loss": 0.0068359375 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001844726850113148, + "lm_loss": 0.0194091796875, + "loss": 0.0076, + "step": 4397, + "total_loss": 0.0194091796875 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018446581034453144, + "lm_loss": 0.004913330078125, + "loss": 0.0084, + "step": 4398, + "total_loss": 0.004913330078125 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001844589342843738, + "lm_loss": 0.00640869140625, + "loss": 0.0073, + "step": 4399, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001844520568309553, + "lm_loss": 0.008544921875, + "loss": 0.008, + "step": 4400, + "total_loss": 0.008544921875 + }, + { + "epoch": 1.8, + "eval_lm_loss": 0.00976422056555748, + "eval_loss": 0.010152596980333328, + "eval_runtime": 43.9624, + "eval_samples_per_second": 22.747, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.00976422056555748, + "lm_loss": 0.000942230224609375, + "step": 4400, + "total_loss": 0.000942230224609375 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001844451779843894, + "lm_loss": 0.006683349609375, + "loss": 0.0067, + "step": 4401, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018443829774478956, + "lm_loss": 0.008056640625, + "loss": 0.006, + "step": 4402, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018443141611226927, + "lm_loss": 0.01416015625, + "loss": 0.0076, + "step": 4403, + "total_loss": 0.01416015625 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001844245330869421, + "lm_loss": 0.00628662109375, + "loss": 0.0065, + "step": 4404, + "total_loss": 0.00628662109375 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018441764866892156, + "lm_loss": 0.00701904296875, + "loss": 0.0094, + "step": 4405, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001844107628583212, + "lm_loss": 0.005096435546875, + "loss": 0.0062, + "step": 4406, + "total_loss": 0.005096435546875 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018440387565525464, + "lm_loss": 0.005645751953125, + "loss": 0.0069, + "step": 4407, + "total_loss": 0.005645751953125 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001843969870598355, + "lm_loss": 0.00823974609375, + "loss": 0.0075, + "step": 4408, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018439009707217738, + "lm_loss": 0.004241943359375, + "loss": 0.0081, + "step": 4409, + "total_loss": 0.004241943359375 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018438320569239398, + "lm_loss": 0.01177978515625, + "loss": 0.0089, + "step": 4410, + "total_loss": 0.01177978515625 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018437631292059895, + "lm_loss": 0.01055908203125, + "loss": 0.008, + "step": 4411, + "total_loss": 0.01055908203125 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018436941875690603, + "lm_loss": 0.00093841552734375, + "loss": 0.0092, + "step": 4412, + "total_loss": 0.00093841552734375 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001843625232014289, + "lm_loss": 0.0074462890625, + "loss": 0.0093, + "step": 4413, + "total_loss": 0.0074462890625 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018435562625428138, + "lm_loss": 0.004913330078125, + "loss": 0.0059, + "step": 4414, + "total_loss": 0.004913330078125 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018434872791557716, + "lm_loss": 0.00872802734375, + "loss": 0.0083, + "step": 4415, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018434182818543012, + "lm_loss": 0.00775146484375, + "loss": 0.0069, + "step": 4416, + "total_loss": 0.00775146484375 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018433492706395398, + "lm_loss": 0.00872802734375, + "loss": 0.0068, + "step": 4417, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018432802455126268, + "lm_loss": 0.009765625, + "loss": 0.0076, + "step": 4418, + "total_loss": 0.009765625 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018432112064747005, + "lm_loss": 0.005340576171875, + "loss": 0.0091, + "step": 4419, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018431421535268998, + "lm_loss": 0.01348876953125, + "loss": 0.0062, + "step": 4420, + "total_loss": 0.01348876953125 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018430730866703638, + "lm_loss": 0.0101318359375, + "loss": 0.0095, + "step": 4421, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018430040059062318, + "lm_loss": 0.01214599609375, + "loss": 0.0076, + "step": 4422, + "total_loss": 0.01214599609375 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018429349112356432, + "lm_loss": 0.001953125, + "loss": 0.0069, + "step": 4423, + "total_loss": 0.001953125 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018428658026597387, + "lm_loss": 0.0076904296875, + "loss": 0.0085, + "step": 4424, + "total_loss": 0.0076904296875 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001842796680179657, + "lm_loss": 0.007415771484375, + "loss": 0.0092, + "step": 4425, + "total_loss": 0.007415771484375 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001842727543796539, + "lm_loss": 0.004180908203125, + "loss": 0.0076, + "step": 4426, + "total_loss": 0.004180908203125 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018426583935115255, + "lm_loss": 0.0089111328125, + "loss": 0.0059, + "step": 4427, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018425892293257568, + "lm_loss": 0.003204345703125, + "loss": 0.0061, + "step": 4428, + "total_loss": 0.003204345703125 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018425200512403744, + "lm_loss": 0.011474609375, + "loss": 0.0083, + "step": 4429, + "total_loss": 0.011474609375 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018424508592565188, + "lm_loss": 0.0089111328125, + "loss": 0.0069, + "step": 4430, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018423816533753317, + "lm_loss": 0.004486083984375, + "loss": 0.0076, + "step": 4431, + "total_loss": 0.004486083984375 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018423124335979548, + "lm_loss": 0.00775146484375, + "loss": 0.008, + "step": 4432, + "total_loss": 0.00775146484375 + }, + { + "epoch": 1.81, + "learning_rate": 0.000184224319992553, + "lm_loss": 0.009033203125, + "loss": 0.007, + "step": 4433, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018421739523591992, + "lm_loss": 0.002044677734375, + "loss": 0.0077, + "step": 4434, + "total_loss": 0.002044677734375 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018421046909001046, + "lm_loss": 0.0021514892578125, + "loss": 0.0073, + "step": 4435, + "total_loss": 0.0021514892578125 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018420354155493894, + "lm_loss": 0.00445556640625, + "loss": 0.0063, + "step": 4436, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001841966126308196, + "lm_loss": 0.0101318359375, + "loss": 0.0086, + "step": 4437, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001841896823177667, + "lm_loss": 0.006927490234375, + "loss": 0.0074, + "step": 4438, + "total_loss": 0.006927490234375 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018418275061589466, + "lm_loss": 0.0096435546875, + "loss": 0.0066, + "step": 4439, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018417581752531775, + "lm_loss": 0.00457763671875, + "loss": 0.0071, + "step": 4440, + "total_loss": 0.00457763671875 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018416888304615035, + "lm_loss": 0.00579833984375, + "loss": 0.0084, + "step": 4441, + "total_loss": 0.00579833984375 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018416194717850687, + "lm_loss": 0.007720947265625, + "loss": 0.0088, + "step": 4442, + "total_loss": 0.007720947265625 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018415500992250173, + "lm_loss": 0.0185546875, + "loss": 0.0076, + "step": 4443, + "total_loss": 0.0185546875 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018414807127824936, + "lm_loss": 0.003936767578125, + "loss": 0.0066, + "step": 4444, + "total_loss": 0.003936767578125 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018414113124586422, + "lm_loss": 0.00592041015625, + "loss": 0.0068, + "step": 4445, + "total_loss": 0.00592041015625 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001841341898254608, + "lm_loss": 0.014404296875, + "loss": 0.0073, + "step": 4446, + "total_loss": 0.014404296875 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001841272470171536, + "lm_loss": 0.00408935546875, + "loss": 0.0048, + "step": 4447, + "total_loss": 0.00408935546875 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018412030282105718, + "lm_loss": 0.006439208984375, + "loss": 0.0059, + "step": 4448, + "total_loss": 0.006439208984375 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018411335723728608, + "lm_loss": 0.003692626953125, + "loss": 0.0065, + "step": 4449, + "total_loss": 0.003692626953125 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018410641026595482, + "lm_loss": 0.0074462890625, + "loss": 0.0084, + "step": 4450, + "total_loss": 0.0074462890625 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018409946190717807, + "lm_loss": 0.00616455078125, + "loss": 0.007, + "step": 4451, + "total_loss": 0.00616455078125 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018409251216107046, + "lm_loss": 0.00787353515625, + "loss": 0.0066, + "step": 4452, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018408556102774657, + "lm_loss": 0.00518798828125, + "loss": 0.0066, + "step": 4453, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018407860850732112, + "lm_loss": 0.006317138671875, + "loss": 0.009, + "step": 4454, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018407165459990876, + "lm_loss": 0.00799560546875, + "loss": 0.007, + "step": 4455, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018406469930562426, + "lm_loss": 0.006805419921875, + "loss": 0.008, + "step": 4456, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018405774262458227, + "lm_loss": 0.00439453125, + "loss": 0.0063, + "step": 4457, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001840507845568977, + "lm_loss": 0.00628662109375, + "loss": 0.007, + "step": 4458, + "total_loss": 0.00628662109375 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018404382510268515, + "lm_loss": 0.01116943359375, + "loss": 0.0076, + "step": 4459, + "total_loss": 0.01116943359375 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018403686426205954, + "lm_loss": 0.00543212890625, + "loss": 0.0078, + "step": 4460, + "total_loss": 0.00543212890625 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018402990203513566, + "lm_loss": 0.004913330078125, + "loss": 0.0067, + "step": 4461, + "total_loss": 0.004913330078125 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018402293842202837, + "lm_loss": 0.002899169921875, + "loss": 0.0058, + "step": 4462, + "total_loss": 0.002899169921875 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018401597342285254, + "lm_loss": 0.003936767578125, + "loss": 0.0071, + "step": 4463, + "total_loss": 0.003936767578125 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001840090070377231, + "lm_loss": 0.01263427734375, + "loss": 0.0068, + "step": 4464, + "total_loss": 0.01263427734375 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018400203926675492, + "lm_loss": 0.007232666015625, + "loss": 0.008, + "step": 4465, + "total_loss": 0.007232666015625 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018399507011006297, + "lm_loss": 0.01483154296875, + "loss": 0.0079, + "step": 4466, + "total_loss": 0.01483154296875 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001839880995677622, + "lm_loss": 0.00555419921875, + "loss": 0.0069, + "step": 4467, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018398112763996763, + "lm_loss": 0.0067138671875, + "loss": 0.0067, + "step": 4468, + "total_loss": 0.0067138671875 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018397415432679423, + "lm_loss": 0.006805419921875, + "loss": 0.0077, + "step": 4469, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018396717962835705, + "lm_loss": 0.006134033203125, + "loss": 0.0084, + "step": 4470, + "total_loss": 0.006134033203125 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018396020354477117, + "lm_loss": 0.004119873046875, + "loss": 0.0073, + "step": 4471, + "total_loss": 0.004119873046875 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001839532260761516, + "lm_loss": 0.0067138671875, + "loss": 0.0059, + "step": 4472, + "total_loss": 0.0067138671875 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018394624722261355, + "lm_loss": 0.00677490234375, + "loss": 0.0057, + "step": 4473, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018393926698427205, + "lm_loss": 0.00665283203125, + "loss": 0.0081, + "step": 4474, + "total_loss": 0.00665283203125 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018393228536124233, + "lm_loss": 0.006591796875, + "loss": 0.0061, + "step": 4475, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018392530235363945, + "lm_loss": 0.0064697265625, + "loss": 0.0077, + "step": 4476, + "total_loss": 0.0064697265625 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018391831796157873, + "lm_loss": 0.006072998046875, + "loss": 0.0081, + "step": 4477, + "total_loss": 0.006072998046875 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001839113321851753, + "lm_loss": 0.00299072265625, + "loss": 0.0078, + "step": 4478, + "total_loss": 0.00299072265625 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018390434502454442, + "lm_loss": 0.0028228759765625, + "loss": 0.0083, + "step": 4479, + "total_loss": 0.0028228759765625 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018389735647980136, + "lm_loss": 0.00872802734375, + "loss": 0.0064, + "step": 4480, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001838903665510614, + "lm_loss": 0.004486083984375, + "loss": 0.0067, + "step": 4481, + "total_loss": 0.004486083984375 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018388337523843985, + "lm_loss": 0.00787353515625, + "loss": 0.0076, + "step": 4482, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018387638254205207, + "lm_loss": 0.0125732421875, + "loss": 0.0064, + "step": 4483, + "total_loss": 0.0125732421875 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018386938846201337, + "lm_loss": 0.00897216796875, + "loss": 0.0072, + "step": 4484, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018386239299843913, + "lm_loss": 0.00518798828125, + "loss": 0.0052, + "step": 4485, + "total_loss": 0.00518798828125 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018385539615144476, + "lm_loss": 0.00701904296875, + "loss": 0.0063, + "step": 4486, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001838483979211457, + "lm_loss": 0.00482177734375, + "loss": 0.0077, + "step": 4487, + "total_loss": 0.00482177734375 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018384139830765735, + "lm_loss": 0.126953125, + "loss": 0.0167, + "step": 4488, + "total_loss": 0.126953125 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018383439731109525, + "lm_loss": 0.01153564453125, + "loss": 0.0091, + "step": 4489, + "total_loss": 0.01153564453125 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018382739493157484, + "lm_loss": 0.01153564453125, + "loss": 0.0066, + "step": 4490, + "total_loss": 0.01153564453125 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018382039116921162, + "lm_loss": 0.00567626953125, + "loss": 0.0068, + "step": 4491, + "total_loss": 0.00567626953125 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018381338602412115, + "lm_loss": 0.008056640625, + "loss": 0.0074, + "step": 4492, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018380637949641903, + "lm_loss": 0.00787353515625, + "loss": 0.0079, + "step": 4493, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018379937158622073, + "lm_loss": 0.00823974609375, + "loss": 0.0073, + "step": 4494, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018379236229364195, + "lm_loss": 0.0185546875, + "loss": 0.0087, + "step": 4495, + "total_loss": 0.0185546875 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018378535161879832, + "lm_loss": 0.00390625, + "loss": 0.0063, + "step": 4496, + "total_loss": 0.00390625 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001837783395618054, + "lm_loss": 0.0047607421875, + "loss": 0.0062, + "step": 4497, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018377132612277896, + "lm_loss": 0.004486083984375, + "loss": 0.0058, + "step": 4498, + "total_loss": 0.004486083984375 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018376431130183468, + "lm_loss": 0.0031585693359375, + "loss": 0.0067, + "step": 4499, + "total_loss": 0.0031585693359375 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018375729509908823, + "lm_loss": 0.00701904296875, + "loss": 0.0067, + "step": 4500, + "total_loss": 0.00701904296875 + }, + { + "epoch": 1.84, + "eval_lm_loss": 0.009700492955744267, + "eval_loss": 0.010171051137149334, + "eval_runtime": 44.0137, + "eval_samples_per_second": 22.72, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.009700492955744267, + "lm_loss": 0.00225830078125, + "step": 4500, + "total_loss": 0.00225830078125 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001837502775146554, + "lm_loss": 0.01019287109375, + "loss": 0.0087, + "step": 4501, + "total_loss": 0.01019287109375 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001837432585486519, + "lm_loss": 0.00933837890625, + "loss": 0.0073, + "step": 4502, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018373623820119357, + "lm_loss": 0.00848388671875, + "loss": 0.0063, + "step": 4503, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018372921647239618, + "lm_loss": 0.00982666015625, + "loss": 0.0098, + "step": 4504, + "total_loss": 0.00982666015625 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001837221933623756, + "lm_loss": 0.007415771484375, + "loss": 0.0088, + "step": 4505, + "total_loss": 0.007415771484375 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001837151688712477, + "lm_loss": 0.006439208984375, + "loss": 0.0082, + "step": 4506, + "total_loss": 0.006439208984375 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001837081429991283, + "lm_loss": 0.006622314453125, + "loss": 0.0072, + "step": 4507, + "total_loss": 0.006622314453125 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001837011157461333, + "lm_loss": 0.00921630859375, + "loss": 0.0081, + "step": 4508, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001836940871123787, + "lm_loss": 0.01104736328125, + "loss": 0.007, + "step": 4509, + "total_loss": 0.01104736328125 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018368705709798037, + "lm_loss": 0.00653076171875, + "loss": 0.0071, + "step": 4510, + "total_loss": 0.00653076171875 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018368002570305433, + "lm_loss": 0.007171630859375, + "loss": 0.0067, + "step": 4511, + "total_loss": 0.007171630859375 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018367299292771651, + "lm_loss": 0.01129150390625, + "loss": 0.0072, + "step": 4512, + "total_loss": 0.01129150390625 + }, + { + "epoch": 1.85, + "learning_rate": 0.000183665958772083, + "lm_loss": 0.006378173828125, + "loss": 0.0059, + "step": 4513, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001836589232362698, + "lm_loss": 0.0076904296875, + "loss": 0.0063, + "step": 4514, + "total_loss": 0.0076904296875 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018365188632039295, + "lm_loss": 0.0118408203125, + "loss": 0.0067, + "step": 4515, + "total_loss": 0.0118408203125 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001836448480245686, + "lm_loss": 0.002227783203125, + "loss": 0.007, + "step": 4516, + "total_loss": 0.002227783203125 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018363780834891277, + "lm_loss": 0.00823974609375, + "loss": 0.0087, + "step": 4517, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018363076729354164, + "lm_loss": 0.00445556640625, + "loss": 0.0083, + "step": 4518, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018362372485857133, + "lm_loss": 0.003173828125, + "loss": 0.0064, + "step": 4519, + "total_loss": 0.003173828125 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018361668104411807, + "lm_loss": 0.00909423828125, + "loss": 0.0083, + "step": 4520, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018360963585029803, + "lm_loss": 0.0032196044921875, + "loss": 0.0068, + "step": 4521, + "total_loss": 0.0032196044921875 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018360258927722738, + "lm_loss": 0.00732421875, + "loss": 0.0071, + "step": 4522, + "total_loss": 0.00732421875 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018359554132502243, + "lm_loss": 0.006317138671875, + "loss": 0.0078, + "step": 4523, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018358849199379942, + "lm_loss": 0.0142822265625, + "loss": 0.0078, + "step": 4524, + "total_loss": 0.0142822265625 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018358144128367463, + "lm_loss": 0.005584716796875, + "loss": 0.0063, + "step": 4525, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018357438919476438, + "lm_loss": 0.0064697265625, + "loss": 0.007, + "step": 4526, + "total_loss": 0.0064697265625 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018356733572718502, + "lm_loss": 0.009521484375, + "loss": 0.0063, + "step": 4527, + "total_loss": 0.009521484375 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018356028088105286, + "lm_loss": 0.0125732421875, + "loss": 0.0093, + "step": 4528, + "total_loss": 0.0125732421875 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018355322465648434, + "lm_loss": 0.01239013671875, + "loss": 0.0089, + "step": 4529, + "total_loss": 0.01239013671875 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018354616705359582, + "lm_loss": 0.006805419921875, + "loss": 0.0065, + "step": 4530, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018353910807250372, + "lm_loss": 0.01025390625, + "loss": 0.0055, + "step": 4531, + "total_loss": 0.01025390625 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001835320477133245, + "lm_loss": 0.01422119140625, + "loss": 0.0072, + "step": 4532, + "total_loss": 0.01422119140625 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018352498597617463, + "lm_loss": 0.0050048828125, + "loss": 0.0066, + "step": 4533, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018351792286117063, + "lm_loss": 0.005218505859375, + "loss": 0.0054, + "step": 4534, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018351085836842897, + "lm_loss": 0.0106201171875, + "loss": 0.0077, + "step": 4535, + "total_loss": 0.0106201171875 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001835037924980662, + "lm_loss": 0.0024566650390625, + "loss": 0.0073, + "step": 4536, + "total_loss": 0.0024566650390625 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001834967252501989, + "lm_loss": 0.002777099609375, + "loss": 0.007, + "step": 4537, + "total_loss": 0.002777099609375 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018348965662494363, + "lm_loss": 0.005828857421875, + "loss": 0.0065, + "step": 4538, + "total_loss": 0.005828857421875 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018348258662241701, + "lm_loss": 0.003997802734375, + "loss": 0.0083, + "step": 4539, + "total_loss": 0.003997802734375 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018347551524273566, + "lm_loss": 0.0010833740234375, + "loss": 0.009, + "step": 4540, + "total_loss": 0.0010833740234375 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018346844248601625, + "lm_loss": 0.007598876953125, + "loss": 0.0061, + "step": 4541, + "total_loss": 0.007598876953125 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018346136835237543, + "lm_loss": 0.005767822265625, + "loss": 0.0072, + "step": 4542, + "total_loss": 0.005767822265625 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018345429284192993, + "lm_loss": 0.004486083984375, + "loss": 0.0059, + "step": 4543, + "total_loss": 0.004486083984375 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018344721595479644, + "lm_loss": 0.021484375, + "loss": 0.0078, + "step": 4544, + "total_loss": 0.021484375 + }, + { + "epoch": 1.86, + "learning_rate": 0.0001834401376910917, + "lm_loss": 0.0069580078125, + "loss": 0.0081, + "step": 4545, + "total_loss": 0.0069580078125 + }, + { + "epoch": 1.86, + "learning_rate": 0.0001834330580509325, + "lm_loss": 0.005126953125, + "loss": 0.0059, + "step": 4546, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018342597703443562, + "lm_loss": 0.004791259765625, + "loss": 0.007, + "step": 4547, + "total_loss": 0.004791259765625 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018341889464171783, + "lm_loss": 0.00872802734375, + "loss": 0.0071, + "step": 4548, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018341181087289603, + "lm_loss": 0.00732421875, + "loss": 0.0071, + "step": 4549, + "total_loss": 0.00732421875 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018340472572808705, + "lm_loss": 0.006988525390625, + "loss": 0.0073, + "step": 4550, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018339763920740773, + "lm_loss": 0.006683349609375, + "loss": 0.0066, + "step": 4551, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018339055131097505, + "lm_loss": 0.0084228515625, + "loss": 0.0077, + "step": 4552, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018338346203890585, + "lm_loss": 0.01214599609375, + "loss": 0.008, + "step": 4553, + "total_loss": 0.01214599609375 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018337637139131713, + "lm_loss": 0.006500244140625, + "loss": 0.0068, + "step": 4554, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018336927936832585, + "lm_loss": 0.006378173828125, + "loss": 0.0077, + "step": 4555, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.86, + "learning_rate": 0.000183362185970049, + "lm_loss": 0.01043701171875, + "loss": 0.008, + "step": 4556, + "total_loss": 0.01043701171875 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018335509119660358, + "lm_loss": 0.008056640625, + "loss": 0.0072, + "step": 4557, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018334799504810665, + "lm_loss": 0.00323486328125, + "loss": 0.0078, + "step": 4558, + "total_loss": 0.00323486328125 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018334089752467526, + "lm_loss": 0.00933837890625, + "loss": 0.0073, + "step": 4559, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.86, + "learning_rate": 0.0001833337986264265, + "lm_loss": 0.00244140625, + "loss": 0.0053, + "step": 4560, + "total_loss": 0.00244140625 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018332669835347746, + "lm_loss": 0.00433349609375, + "loss": 0.0071, + "step": 4561, + "total_loss": 0.00433349609375 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001833195967059453, + "lm_loss": 0.0162353515625, + "loss": 0.0065, + "step": 4562, + "total_loss": 0.0162353515625 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001833124936839471, + "lm_loss": 0.01068115234375, + "loss": 0.0065, + "step": 4563, + "total_loss": 0.01068115234375 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018330538928760013, + "lm_loss": 0.0089111328125, + "loss": 0.0061, + "step": 4564, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018329828351702152, + "lm_loss": 0.0078125, + "loss": 0.0074, + "step": 4565, + "total_loss": 0.0078125 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018329117637232848, + "lm_loss": 0.0050048828125, + "loss": 0.0079, + "step": 4566, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018328406785363832, + "lm_loss": 0.007568359375, + "loss": 0.0081, + "step": 4567, + "total_loss": 0.007568359375 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018327695796106826, + "lm_loss": 0.0087890625, + "loss": 0.0058, + "step": 4568, + "total_loss": 0.0087890625 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018326984669473557, + "lm_loss": 0.004638671875, + "loss": 0.0066, + "step": 4569, + "total_loss": 0.004638671875 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001832627340547576, + "lm_loss": 0.0040283203125, + "loss": 0.0069, + "step": 4570, + "total_loss": 0.0040283203125 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018325562004125164, + "lm_loss": 0.0096435546875, + "loss": 0.0056, + "step": 4571, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018324850465433506, + "lm_loss": 0.004913330078125, + "loss": 0.0066, + "step": 4572, + "total_loss": 0.004913330078125 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018324138789412527, + "lm_loss": 0.00823974609375, + "loss": 0.007, + "step": 4573, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018323426976073965, + "lm_loss": 0.01348876953125, + "loss": 0.0082, + "step": 4574, + "total_loss": 0.01348876953125 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018322715025429563, + "lm_loss": 0.0054931640625, + "loss": 0.0064, + "step": 4575, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001832200293749106, + "lm_loss": 0.005096435546875, + "loss": 0.0074, + "step": 4576, + "total_loss": 0.005096435546875 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001832129071227021, + "lm_loss": 0.00811767578125, + "loss": 0.0069, + "step": 4577, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001832057834977876, + "lm_loss": 0.00439453125, + "loss": 0.006, + "step": 4578, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018319865850028458, + "lm_loss": 0.011474609375, + "loss": 0.0074, + "step": 4579, + "total_loss": 0.011474609375 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018319153213031063, + "lm_loss": 0.0028839111328125, + "loss": 0.0078, + "step": 4580, + "total_loss": 0.0028839111328125 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018318440438798325, + "lm_loss": 0.0059814453125, + "loss": 0.0069, + "step": 4581, + "total_loss": 0.0059814453125 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018317727527342008, + "lm_loss": 0.01080322265625, + "loss": 0.0084, + "step": 4582, + "total_loss": 0.01080322265625 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001831701447867387, + "lm_loss": 0.006072998046875, + "loss": 0.0073, + "step": 4583, + "total_loss": 0.006072998046875 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018316301292805668, + "lm_loss": 0.004913330078125, + "loss": 0.0069, + "step": 4584, + "total_loss": 0.004913330078125 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001831558796974918, + "lm_loss": 0.0068359375, + "loss": 0.0073, + "step": 4585, + "total_loss": 0.0068359375 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018314874509516162, + "lm_loss": 0.00537109375, + "loss": 0.0065, + "step": 4586, + "total_loss": 0.00537109375 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018314160912118386, + "lm_loss": 0.00469970703125, + "loss": 0.0077, + "step": 4587, + "total_loss": 0.00469970703125 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018313447177567628, + "lm_loss": 0.005584716796875, + "loss": 0.0072, + "step": 4588, + "total_loss": 0.005584716796875 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018312733305875654, + "lm_loss": 0.0089111328125, + "loss": 0.0091, + "step": 4589, + "total_loss": 0.0089111328125 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018312019297054248, + "lm_loss": 0.006317138671875, + "loss": 0.0072, + "step": 4590, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018311305151115184, + "lm_loss": 0.004791259765625, + "loss": 0.006, + "step": 4591, + "total_loss": 0.004791259765625 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018310590868070246, + "lm_loss": 0.00469970703125, + "loss": 0.0071, + "step": 4592, + "total_loss": 0.00469970703125 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018309876447931213, + "lm_loss": 0.007781982421875, + "loss": 0.0059, + "step": 4593, + "total_loss": 0.007781982421875 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018309161890709877, + "lm_loss": 0.01190185546875, + "loss": 0.008, + "step": 4594, + "total_loss": 0.01190185546875 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018308447196418014, + "lm_loss": 0.00299072265625, + "loss": 0.0057, + "step": 4595, + "total_loss": 0.00299072265625 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018307732365067426, + "lm_loss": 0.005340576171875, + "loss": 0.0077, + "step": 4596, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018307017396669898, + "lm_loss": 0.00445556640625, + "loss": 0.0073, + "step": 4597, + "total_loss": 0.00445556640625 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001830630229123723, + "lm_loss": 0.005889892578125, + "loss": 0.0068, + "step": 4598, + "total_loss": 0.005889892578125 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018305587048781212, + "lm_loss": 0.00811767578125, + "loss": 0.0061, + "step": 4599, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018304871669313645, + "lm_loss": 0.00885009765625, + "loss": 0.0084, + "step": 4600, + "total_loss": 0.00885009765625 + }, + { + "epoch": 1.88, + "eval_lm_loss": 0.009746807627379894, + "eval_loss": 0.01018200907856226, + "eval_runtime": 44.1121, + "eval_samples_per_second": 22.67, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.009746807627379894, + "lm_loss": 0.00109100341796875, + "step": 4600, + "total_loss": 0.00109100341796875 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018304156152846332, + "lm_loss": 0.00604248046875, + "loss": 0.0052, + "step": 4601, + "total_loss": 0.00604248046875 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018303440499391075, + "lm_loss": 0.004974365234375, + "loss": 0.0058, + "step": 4602, + "total_loss": 0.004974365234375 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001830272470895968, + "lm_loss": 0.006622314453125, + "loss": 0.0087, + "step": 4603, + "total_loss": 0.006622314453125 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018302008781563957, + "lm_loss": 0.01300048828125, + "loss": 0.0081, + "step": 4604, + "total_loss": 0.01300048828125 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001830129271721571, + "lm_loss": 0.0036163330078125, + "loss": 0.0082, + "step": 4605, + "total_loss": 0.0036163330078125 + }, + { + "epoch": 1.88, + "learning_rate": 0.0001830057651592676, + "lm_loss": 0.012451171875, + "loss": 0.0073, + "step": 4606, + "total_loss": 0.012451171875 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018299860177708915, + "lm_loss": 0.007415771484375, + "loss": 0.0065, + "step": 4607, + "total_loss": 0.007415771484375 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018299143702573992, + "lm_loss": 0.00421142578125, + "loss": 0.0077, + "step": 4608, + "total_loss": 0.00421142578125 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018298427090533814, + "lm_loss": 0.01202392578125, + "loss": 0.0068, + "step": 4609, + "total_loss": 0.01202392578125 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018297710341600202, + "lm_loss": 0.0106201171875, + "loss": 0.007, + "step": 4610, + "total_loss": 0.0106201171875 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018296993455784976, + "lm_loss": 0.016845703125, + "loss": 0.0068, + "step": 4611, + "total_loss": 0.016845703125 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018296276433099967, + "lm_loss": 0.006988525390625, + "loss": 0.0062, + "step": 4612, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018295559273557, + "lm_loss": 0.0103759765625, + "loss": 0.009, + "step": 4613, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018294841977167906, + "lm_loss": 0.0036163330078125, + "loss": 0.0071, + "step": 4614, + "total_loss": 0.0036163330078125 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018294124543944516, + "lm_loss": 0.0084228515625, + "loss": 0.0064, + "step": 4615, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001829340697389867, + "lm_loss": 0.00885009765625, + "loss": 0.0076, + "step": 4616, + "total_loss": 0.00885009765625 + }, + { + "epoch": 1.89, + "learning_rate": 0.000182926892670422, + "lm_loss": 0.00958251953125, + "loss": 0.0076, + "step": 4617, + "total_loss": 0.00958251953125 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018291971423386945, + "lm_loss": 0.00811767578125, + "loss": 0.0072, + "step": 4618, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018291253442944753, + "lm_loss": 0.006378173828125, + "loss": 0.0076, + "step": 4619, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018290535325727464, + "lm_loss": 0.00762939453125, + "loss": 0.0068, + "step": 4620, + "total_loss": 0.00762939453125 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018289817071746924, + "lm_loss": 0.003326416015625, + "loss": 0.0069, + "step": 4621, + "total_loss": 0.003326416015625 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018289098681014982, + "lm_loss": 0.009521484375, + "loss": 0.0047, + "step": 4622, + "total_loss": 0.009521484375 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018288380153543493, + "lm_loss": 0.01397705078125, + "loss": 0.0067, + "step": 4623, + "total_loss": 0.01397705078125 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018287661489344302, + "lm_loss": 0.005218505859375, + "loss": 0.0061, + "step": 4624, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018286942688429267, + "lm_loss": 0.007080078125, + "loss": 0.0078, + "step": 4625, + "total_loss": 0.007080078125 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001828622375081025, + "lm_loss": 0.00811767578125, + "loss": 0.0091, + "step": 4626, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018285504676499107, + "lm_loss": 0.0022125244140625, + "loss": 0.0049, + "step": 4627, + "total_loss": 0.0022125244140625 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018284785465507702, + "lm_loss": 0.0098876953125, + "loss": 0.0063, + "step": 4628, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018284066117847897, + "lm_loss": 0.006256103515625, + "loss": 0.005, + "step": 4629, + "total_loss": 0.006256103515625 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001828334663353156, + "lm_loss": 0.004608154296875, + "loss": 0.0062, + "step": 4630, + "total_loss": 0.004608154296875 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001828262701257056, + "lm_loss": 0.006591796875, + "loss": 0.0071, + "step": 4631, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018281907254976768, + "lm_loss": 0.00897216796875, + "loss": 0.0093, + "step": 4632, + "total_loss": 0.00897216796875 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018281187360762058, + "lm_loss": 0.0106201171875, + "loss": 0.0085, + "step": 4633, + "total_loss": 0.0106201171875 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018280467329938302, + "lm_loss": 0.0150146484375, + "loss": 0.0091, + "step": 4634, + "total_loss": 0.0150146484375 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018279747162517382, + "lm_loss": 0.00592041015625, + "loss": 0.0069, + "step": 4635, + "total_loss": 0.00592041015625 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018279026858511176, + "lm_loss": 0.0074462890625, + "loss": 0.0068, + "step": 4636, + "total_loss": 0.0074462890625 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018278306417931567, + "lm_loss": 0.0113525390625, + "loss": 0.0075, + "step": 4637, + "total_loss": 0.0113525390625 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018277585840790442, + "lm_loss": 0.01312255859375, + "loss": 0.0068, + "step": 4638, + "total_loss": 0.01312255859375 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018276865127099683, + "lm_loss": 0.008056640625, + "loss": 0.0075, + "step": 4639, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001827614427687118, + "lm_loss": 0.005218505859375, + "loss": 0.0061, + "step": 4640, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001827542329011683, + "lm_loss": 0.00689697265625, + "loss": 0.0098, + "step": 4641, + "total_loss": 0.00689697265625 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018274702166848518, + "lm_loss": 0.01019287109375, + "loss": 0.006, + "step": 4642, + "total_loss": 0.01019287109375 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018273980907078147, + "lm_loss": 0.00830078125, + "loss": 0.0075, + "step": 4643, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018273259510817616, + "lm_loss": 0.0059814453125, + "loss": 0.0079, + "step": 4644, + "total_loss": 0.0059814453125 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018272537978078815, + "lm_loss": 0.004638671875, + "loss": 0.0055, + "step": 4645, + "total_loss": 0.004638671875 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001827181630887366, + "lm_loss": 0.00836181640625, + "loss": 0.0066, + "step": 4646, + "total_loss": 0.00836181640625 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018271094503214042, + "lm_loss": 0.00909423828125, + "loss": 0.0066, + "step": 4647, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018270372561111884, + "lm_loss": 0.0103759765625, + "loss": 0.0073, + "step": 4648, + "total_loss": 0.0103759765625 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018269650482579083, + "lm_loss": 0.007232666015625, + "loss": 0.0057, + "step": 4649, + "total_loss": 0.007232666015625 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018268928267627555, + "lm_loss": 0.011474609375, + "loss": 0.0096, + "step": 4650, + "total_loss": 0.011474609375 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018268205916269212, + "lm_loss": 0.01251220703125, + "loss": 0.0073, + "step": 4651, + "total_loss": 0.01251220703125 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018267483428515972, + "lm_loss": 0.0042724609375, + "loss": 0.0081, + "step": 4652, + "total_loss": 0.0042724609375 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018266760804379754, + "lm_loss": 0.0064697265625, + "loss": 0.006, + "step": 4653, + "total_loss": 0.0064697265625 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018266038043872478, + "lm_loss": 0.0030975341796875, + "loss": 0.0071, + "step": 4654, + "total_loss": 0.0030975341796875 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018265315147006066, + "lm_loss": 0.005767822265625, + "loss": 0.0078, + "step": 4655, + "total_loss": 0.005767822265625 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018264592113792442, + "lm_loss": 0.00457763671875, + "loss": 0.0064, + "step": 4656, + "total_loss": 0.00457763671875 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018263868944243537, + "lm_loss": 0.0022430419921875, + "loss": 0.0077, + "step": 4657, + "total_loss": 0.0022430419921875 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001826314563837128, + "lm_loss": 0.00970458984375, + "loss": 0.0079, + "step": 4658, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018262422196187598, + "lm_loss": 0.01031494140625, + "loss": 0.0062, + "step": 4659, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018261698617704432, + "lm_loss": 0.0035400390625, + "loss": 0.0062, + "step": 4660, + "total_loss": 0.0035400390625 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018260974902933712, + "lm_loss": 0.00384521484375, + "loss": 0.0065, + "step": 4661, + "total_loss": 0.00384521484375 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018260251051887384, + "lm_loss": 0.0027618408203125, + "loss": 0.0063, + "step": 4662, + "total_loss": 0.0027618408203125 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001825952706457738, + "lm_loss": 0.01190185546875, + "loss": 0.007, + "step": 4663, + "total_loss": 0.01190185546875 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001825880294101565, + "lm_loss": 0.0084228515625, + "loss": 0.0087, + "step": 4664, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018258078681214135, + "lm_loss": 0.00823974609375, + "loss": 0.008, + "step": 4665, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018257354285184787, + "lm_loss": 0.005828857421875, + "loss": 0.0062, + "step": 4666, + "total_loss": 0.005828857421875 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018256629752939552, + "lm_loss": 0.010498046875, + "loss": 0.0069, + "step": 4667, + "total_loss": 0.010498046875 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018255905084490383, + "lm_loss": 0.00994873046875, + "loss": 0.0071, + "step": 4668, + "total_loss": 0.00994873046875 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018255180279849235, + "lm_loss": 0.00689697265625, + "loss": 0.0071, + "step": 4669, + "total_loss": 0.00689697265625 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018254455339028068, + "lm_loss": 0.0096435546875, + "loss": 0.007, + "step": 4670, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018253730262038836, + "lm_loss": 0.01068115234375, + "loss": 0.0077, + "step": 4671, + "total_loss": 0.01068115234375 + }, + { + "epoch": 1.91, + "learning_rate": 0.000182530050488935, + "lm_loss": 0.0048828125, + "loss": 0.0071, + "step": 4672, + "total_loss": 0.0048828125 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018252279699604025, + "lm_loss": 0.00946044921875, + "loss": 0.0073, + "step": 4673, + "total_loss": 0.00946044921875 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018251554214182377, + "lm_loss": 0.0034942626953125, + "loss": 0.0066, + "step": 4674, + "total_loss": 0.0034942626953125 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001825082859264052, + "lm_loss": 0.00665283203125, + "loss": 0.0076, + "step": 4675, + "total_loss": 0.00665283203125 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018250102834990433, + "lm_loss": 0.004974365234375, + "loss": 0.0078, + "step": 4676, + "total_loss": 0.004974365234375 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001824937694124408, + "lm_loss": 0.0072021484375, + "loss": 0.006, + "step": 4677, + "total_loss": 0.0072021484375 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018248650911413438, + "lm_loss": 0.0045166015625, + "loss": 0.0061, + "step": 4678, + "total_loss": 0.0045166015625 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018247924745510484, + "lm_loss": 0.00154876708984375, + "loss": 0.0073, + "step": 4679, + "total_loss": 0.00154876708984375 + }, + { + "epoch": 1.91, + "learning_rate": 0.000182471984435472, + "lm_loss": 0.005859375, + "loss": 0.0072, + "step": 4680, + "total_loss": 0.005859375 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018246472005535558, + "lm_loss": 0.01055908203125, + "loss": 0.0089, + "step": 4681, + "total_loss": 0.01055908203125 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018245745431487556, + "lm_loss": 0.007537841796875, + "loss": 0.0065, + "step": 4682, + "total_loss": 0.007537841796875 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018245018721415167, + "lm_loss": 0.005096435546875, + "loss": 0.0071, + "step": 4683, + "total_loss": 0.005096435546875 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018244291875330385, + "lm_loss": 0.0081787109375, + "loss": 0.0071, + "step": 4684, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018243564893245196, + "lm_loss": 0.006744384765625, + "loss": 0.007, + "step": 4685, + "total_loss": 0.006744384765625 + }, + { + "epoch": 1.92, + "learning_rate": 0.000182428377751716, + "lm_loss": 0.0135498046875, + "loss": 0.008, + "step": 4686, + "total_loss": 0.0135498046875 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018242110521121585, + "lm_loss": 0.003265380859375, + "loss": 0.0073, + "step": 4687, + "total_loss": 0.003265380859375 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001824138313110715, + "lm_loss": 0.005615234375, + "loss": 0.0086, + "step": 4688, + "total_loss": 0.005615234375 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018240655605140295, + "lm_loss": 0.00482177734375, + "loss": 0.0069, + "step": 4689, + "total_loss": 0.00482177734375 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018239927943233025, + "lm_loss": 0.00946044921875, + "loss": 0.0071, + "step": 4690, + "total_loss": 0.00946044921875 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018239200145397333, + "lm_loss": 0.00982666015625, + "loss": 0.007, + "step": 4691, + "total_loss": 0.00982666015625 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018238472211645233, + "lm_loss": 0.0042724609375, + "loss": 0.0069, + "step": 4692, + "total_loss": 0.0042724609375 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018237744141988738, + "lm_loss": 0.0059814453125, + "loss": 0.006, + "step": 4693, + "total_loss": 0.0059814453125 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001823701593643985, + "lm_loss": 0.00958251953125, + "loss": 0.0079, + "step": 4694, + "total_loss": 0.00958251953125 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001823628759501058, + "lm_loss": 0.00787353515625, + "loss": 0.0055, + "step": 4695, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018235559117712953, + "lm_loss": 0.003448486328125, + "loss": 0.0059, + "step": 4696, + "total_loss": 0.003448486328125 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018234830504558976, + "lm_loss": 0.00604248046875, + "loss": 0.0079, + "step": 4697, + "total_loss": 0.00604248046875 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018234101755560674, + "lm_loss": 0.0034332275390625, + "loss": 0.0074, + "step": 4698, + "total_loss": 0.0034332275390625 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018233372870730068, + "lm_loss": 0.00933837890625, + "loss": 0.0065, + "step": 4699, + "total_loss": 0.00933837890625 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018232643850079182, + "lm_loss": 0.00372314453125, + "loss": 0.0071, + "step": 4700, + "total_loss": 0.00372314453125 + }, + { + "epoch": 1.92, + "eval_lm_loss": 0.009319985285401344, + "eval_loss": 0.009750260971486568, + "eval_runtime": 44.1299, + "eval_samples_per_second": 22.66, + "eval_steps_per_second": 0.204, + "eval_total_loss": 0.009319985285401344, + "lm_loss": 0.00115203857421875, + "step": 4700, + "total_loss": 0.00115203857421875 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001823191469362004, + "lm_loss": 0.0069580078125, + "loss": 0.0072, + "step": 4701, + "total_loss": 0.0069580078125 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018231185401364674, + "lm_loss": 0.00439453125, + "loss": 0.0056, + "step": 4702, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018230455973325112, + "lm_loss": 0.006378173828125, + "loss": 0.0056, + "step": 4703, + "total_loss": 0.006378173828125 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018229726409513388, + "lm_loss": 0.016357421875, + "loss": 0.0085, + "step": 4704, + "total_loss": 0.016357421875 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018228996709941534, + "lm_loss": 0.01416015625, + "loss": 0.0089, + "step": 4705, + "total_loss": 0.01416015625 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001822826687462159, + "lm_loss": 0.003997802734375, + "loss": 0.0065, + "step": 4706, + "total_loss": 0.003997802734375 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018227536903565596, + "lm_loss": 0.005035400390625, + "loss": 0.0076, + "step": 4707, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018226806796785595, + "lm_loss": 0.00579833984375, + "loss": 0.0057, + "step": 4708, + "total_loss": 0.00579833984375 + }, + { + "epoch": 1.93, + "learning_rate": 0.0001822607655429363, + "lm_loss": 0.007354736328125, + "loss": 0.0073, + "step": 4709, + "total_loss": 0.007354736328125 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018225346176101742, + "lm_loss": 0.006683349609375, + "loss": 0.0075, + "step": 4710, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.93, + "learning_rate": 0.0001822461566222199, + "lm_loss": 0.006072998046875, + "loss": 0.0067, + "step": 4711, + "total_loss": 0.006072998046875 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018223885012666415, + "lm_loss": 0.0072021484375, + "loss": 0.008, + "step": 4712, + "total_loss": 0.0072021484375 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018223154227447075, + "lm_loss": 0.00677490234375, + "loss": 0.0058, + "step": 4713, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018222423306576025, + "lm_loss": 0.005401611328125, + "loss": 0.0054, + "step": 4714, + "total_loss": 0.005401611328125 + }, + { + "epoch": 1.93, + "learning_rate": 0.0001822169225006532, + "lm_loss": 0.01275634765625, + "loss": 0.009, + "step": 4715, + "total_loss": 0.01275634765625 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018220961057927024, + "lm_loss": 0.00482177734375, + "loss": 0.006, + "step": 4716, + "total_loss": 0.00482177734375 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018220229730173194, + "lm_loss": 0.00347900390625, + "loss": 0.008, + "step": 4717, + "total_loss": 0.00347900390625 + }, + { + "epoch": 1.93, + "learning_rate": 0.000182194982668159, + "lm_loss": 0.004425048828125, + "loss": 0.0059, + "step": 4718, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.93, + "learning_rate": 0.000182187666678672, + "lm_loss": 0.0030975341796875, + "loss": 0.0071, + "step": 4719, + "total_loss": 0.0030975341796875 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018218034933339174, + "lm_loss": 0.0038604736328125, + "loss": 0.0079, + "step": 4720, + "total_loss": 0.0038604736328125 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018217303063243882, + "lm_loss": 0.01287841796875, + "loss": 0.0077, + "step": 4721, + "total_loss": 0.01287841796875 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018216571057593406, + "lm_loss": 0.0081787109375, + "loss": 0.0066, + "step": 4722, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018215838916399815, + "lm_loss": 0.00970458984375, + "loss": 0.0069, + "step": 4723, + "total_loss": 0.00970458984375 + }, + { + "epoch": 1.93, + "learning_rate": 0.0001821510663967519, + "lm_loss": 0.0052490234375, + "loss": 0.0069, + "step": 4724, + "total_loss": 0.0052490234375 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018214374227431605, + "lm_loss": 0.01092529296875, + "loss": 0.0071, + "step": 4725, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018213641679681153, + "lm_loss": 0.0167236328125, + "loss": 0.0091, + "step": 4726, + "total_loss": 0.0167236328125 + }, + { + "epoch": 1.93, + "learning_rate": 0.0001821290899643591, + "lm_loss": 0.004364013671875, + "loss": 0.0058, + "step": 4727, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018212176177707964, + "lm_loss": 0.0037994384765625, + "loss": 0.0066, + "step": 4728, + "total_loss": 0.0037994384765625 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018211443223509406, + "lm_loss": 0.01531982421875, + "loss": 0.0095, + "step": 4729, + "total_loss": 0.01531982421875 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018210710133852325, + "lm_loss": 0.0166015625, + "loss": 0.0079, + "step": 4730, + "total_loss": 0.0166015625 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018209976908748813, + "lm_loss": 0.005157470703125, + "loss": 0.0071, + "step": 4731, + "total_loss": 0.005157470703125 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018209243548210973, + "lm_loss": 0.006744384765625, + "loss": 0.0068, + "step": 4732, + "total_loss": 0.006744384765625 + }, + { + "epoch": 1.93, + "learning_rate": 0.0001820851005225089, + "lm_loss": 0.01092529296875, + "loss": 0.0085, + "step": 4733, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018207776420880675, + "lm_loss": 0.006622314453125, + "loss": 0.0078, + "step": 4734, + "total_loss": 0.006622314453125 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018207042654112428, + "lm_loss": 0.005828857421875, + "loss": 0.0058, + "step": 4735, + "total_loss": 0.005828857421875 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018206308751958248, + "lm_loss": 0.0034027099609375, + "loss": 0.0071, + "step": 4736, + "total_loss": 0.0034027099609375 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018205574714430246, + "lm_loss": 0.00811767578125, + "loss": 0.0057, + "step": 4737, + "total_loss": 0.00811767578125 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018204840541540532, + "lm_loss": 0.006683349609375, + "loss": 0.0095, + "step": 4738, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018204106233301218, + "lm_loss": 0.0098876953125, + "loss": 0.0082, + "step": 4739, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018203371789724413, + "lm_loss": 0.00982666015625, + "loss": 0.0076, + "step": 4740, + "total_loss": 0.00982666015625 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001820263721082223, + "lm_loss": 0.002655029296875, + "loss": 0.008, + "step": 4741, + "total_loss": 0.002655029296875 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018201902496606798, + "lm_loss": 0.00830078125, + "loss": 0.0056, + "step": 4742, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018201167647090226, + "lm_loss": 0.01031494140625, + "loss": 0.007, + "step": 4743, + "total_loss": 0.01031494140625 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018200432662284643, + "lm_loss": 0.00439453125, + "loss": 0.0061, + "step": 4744, + "total_loss": 0.00439453125 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001819969754220217, + "lm_loss": 0.0028228759765625, + "loss": 0.0065, + "step": 4745, + "total_loss": 0.0028228759765625 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018198962286854938, + "lm_loss": 0.005218505859375, + "loss": 0.0077, + "step": 4746, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018198226896255068, + "lm_loss": 0.00250244140625, + "loss": 0.0071, + "step": 4747, + "total_loss": 0.00250244140625 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018197491370414701, + "lm_loss": 0.004486083984375, + "loss": 0.0072, + "step": 4748, + "total_loss": 0.004486083984375 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018196755709345966, + "lm_loss": 0.00457763671875, + "loss": 0.0064, + "step": 4749, + "total_loss": 0.00457763671875 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018196019913060995, + "lm_loss": 0.004364013671875, + "loss": 0.0074, + "step": 4750, + "total_loss": 0.004364013671875 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001819528398157193, + "lm_loss": 0.006988525390625, + "loss": 0.0083, + "step": 4751, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018194547914890911, + "lm_loss": 0.0098876953125, + "loss": 0.0067, + "step": 4752, + "total_loss": 0.0098876953125 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018193811713030083, + "lm_loss": 0.004425048828125, + "loss": 0.0093, + "step": 4753, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018193075376001582, + "lm_loss": 0.0091552734375, + "loss": 0.0079, + "step": 4754, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018192338903817566, + "lm_loss": 0.00830078125, + "loss": 0.0072, + "step": 4755, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018191602296490172, + "lm_loss": 0.006988525390625, + "loss": 0.0069, + "step": 4756, + "total_loss": 0.006988525390625 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018190865554031563, + "lm_loss": 0.0084228515625, + "loss": 0.0082, + "step": 4757, + "total_loss": 0.0084228515625 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018190128676453886, + "lm_loss": 0.005035400390625, + "loss": 0.0066, + "step": 4758, + "total_loss": 0.005035400390625 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018189391663769297, + "lm_loss": 0.0025787353515625, + "loss": 0.0065, + "step": 4759, + "total_loss": 0.0025787353515625 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018188654515989956, + "lm_loss": 0.006134033203125, + "loss": 0.0066, + "step": 4760, + "total_loss": 0.006134033203125 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001818791723312802, + "lm_loss": 0.00604248046875, + "loss": 0.0069, + "step": 4761, + "total_loss": 0.00604248046875 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018187179815195656, + "lm_loss": 0.0113525390625, + "loss": 0.0068, + "step": 4762, + "total_loss": 0.0113525390625 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001818644226220503, + "lm_loss": 0.0032196044921875, + "loss": 0.0067, + "step": 4763, + "total_loss": 0.0032196044921875 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018185704574168301, + "lm_loss": 0.005706787109375, + "loss": 0.008, + "step": 4764, + "total_loss": 0.005706787109375 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018184966751097644, + "lm_loss": 0.009033203125, + "loss": 0.0074, + "step": 4765, + "total_loss": 0.009033203125 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018184228793005224, + "lm_loss": 0.00640869140625, + "loss": 0.0066, + "step": 4766, + "total_loss": 0.00640869140625 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018183490699903224, + "lm_loss": 0.00872802734375, + "loss": 0.0078, + "step": 4767, + "total_loss": 0.00872802734375 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018182752471803814, + "lm_loss": 0.00848388671875, + "loss": 0.0076, + "step": 4768, + "total_loss": 0.00848388671875 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018182014108719175, + "lm_loss": 0.004791259765625, + "loss": 0.0071, + "step": 4769, + "total_loss": 0.004791259765625 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018181275610661485, + "lm_loss": 0.004058837890625, + "loss": 0.0081, + "step": 4770, + "total_loss": 0.004058837890625 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018180536977642923, + "lm_loss": 0.006683349609375, + "loss": 0.0054, + "step": 4771, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001817979820967568, + "lm_loss": 0.00738525390625, + "loss": 0.007, + "step": 4772, + "total_loss": 0.00738525390625 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018179059306771943, + "lm_loss": 0.003173828125, + "loss": 0.0059, + "step": 4773, + "total_loss": 0.003173828125 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018178320268943897, + "lm_loss": 0.00457763671875, + "loss": 0.0072, + "step": 4774, + "total_loss": 0.00457763671875 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018177581096203736, + "lm_loss": 0.01092529296875, + "loss": 0.0073, + "step": 4775, + "total_loss": 0.01092529296875 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018176841788563653, + "lm_loss": 0.00909423828125, + "loss": 0.006, + "step": 4776, + "total_loss": 0.00909423828125 + }, + { + "epoch": 1.95, + "learning_rate": 0.0001817610234603584, + "lm_loss": 0.012939453125, + "loss": 0.0072, + "step": 4777, + "total_loss": 0.012939453125 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018175362768632503, + "lm_loss": 0.008056640625, + "loss": 0.0072, + "step": 4778, + "total_loss": 0.008056640625 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018174623056365835, + "lm_loss": 0.0057373046875, + "loss": 0.0069, + "step": 4779, + "total_loss": 0.0057373046875 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018173883209248045, + "lm_loss": 0.004241943359375, + "loss": 0.0077, + "step": 4780, + "total_loss": 0.004241943359375 + }, + { + "epoch": 1.95, + "learning_rate": 0.00018173143227291332, + "lm_loss": 0.006317138671875, + "loss": 0.0072, + "step": 4781, + "total_loss": 0.006317138671875 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018172403110507904, + "lm_loss": 0.0035247802734375, + "loss": 0.0072, + "step": 4782, + "total_loss": 0.0035247802734375 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018171662858909975, + "lm_loss": 0.00567626953125, + "loss": 0.0064, + "step": 4783, + "total_loss": 0.00567626953125 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001817092247250975, + "lm_loss": 0.00592041015625, + "loss": 0.009, + "step": 4784, + "total_loss": 0.00592041015625 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001817018195131945, + "lm_loss": 0.00421142578125, + "loss": 0.0054, + "step": 4785, + "total_loss": 0.00421142578125 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018169441295351281, + "lm_loss": 0.00775146484375, + "loss": 0.0083, + "step": 4786, + "total_loss": 0.00775146484375 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001816870050461747, + "lm_loss": 0.010498046875, + "loss": 0.0065, + "step": 4787, + "total_loss": 0.010498046875 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018167959579130233, + "lm_loss": 0.0096435546875, + "loss": 0.0084, + "step": 4788, + "total_loss": 0.0096435546875 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018167218518901792, + "lm_loss": 0.00762939453125, + "loss": 0.0067, + "step": 4789, + "total_loss": 0.00762939453125 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018166477323944375, + "lm_loss": 0.002410888671875, + "loss": 0.0063, + "step": 4790, + "total_loss": 0.002410888671875 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018165735994270205, + "lm_loss": 0.01007080078125, + "loss": 0.0076, + "step": 4791, + "total_loss": 0.01007080078125 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018164994529891514, + "lm_loss": 0.00543212890625, + "loss": 0.0077, + "step": 4792, + "total_loss": 0.00543212890625 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018164252930820535, + "lm_loss": 0.00494384765625, + "loss": 0.0066, + "step": 4793, + "total_loss": 0.00494384765625 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018163511197069498, + "lm_loss": 0.00408935546875, + "loss": 0.0075, + "step": 4794, + "total_loss": 0.00408935546875 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018162769328650642, + "lm_loss": 0.0050048828125, + "loss": 0.007, + "step": 4795, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.96, + "learning_rate": 0.000181620273255762, + "lm_loss": 0.00799560546875, + "loss": 0.0064, + "step": 4796, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001816128518785842, + "lm_loss": 0.01116943359375, + "loss": 0.0088, + "step": 4797, + "total_loss": 0.01116943359375 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018160542915509535, + "lm_loss": 0.004974365234375, + "loss": 0.0057, + "step": 4798, + "total_loss": 0.004974365234375 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018159800508541797, + "lm_loss": 0.00823974609375, + "loss": 0.0065, + "step": 4799, + "total_loss": 0.00823974609375 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018159057966967449, + "lm_loss": 0.0027008056640625, + "loss": 0.0088, + "step": 4800, + "total_loss": 0.0027008056640625 + }, + { + "epoch": 1.96, + "eval_lm_loss": 0.009422020055353642, + "eval_loss": 0.009859314188361168, + "eval_runtime": 43.9688, + "eval_samples_per_second": 22.743, + "eval_steps_per_second": 0.205, + "eval_total_loss": 0.009422020055353642, + "lm_loss": 0.00168609619140625, + "step": 4800, + "total_loss": 0.00168609619140625 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018158315290798746, + "lm_loss": 0.006866455078125, + "loss": 0.0064, + "step": 4801, + "total_loss": 0.006866455078125 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018157572480047936, + "lm_loss": 0.006134033203125, + "loss": 0.0091, + "step": 4802, + "total_loss": 0.006134033203125 + }, + { + "epoch": 1.96, + "learning_rate": 0.0001815682953472727, + "lm_loss": 0.01275634765625, + "loss": 0.0063, + "step": 4803, + "total_loss": 0.01275634765625 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018156086454849005, + "lm_loss": 0.0042724609375, + "loss": 0.0076, + "step": 4804, + "total_loss": 0.0042724609375 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018155343240425402, + "lm_loss": 0.00555419921875, + "loss": 0.0067, + "step": 4805, + "total_loss": 0.00555419921875 + }, + { + "epoch": 1.96, + "learning_rate": 0.00018154599891468718, + "lm_loss": 0.00457763671875, + "loss": 0.0073, + "step": 4806, + "total_loss": 0.00457763671875 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001815385640799122, + "lm_loss": 0.00921630859375, + "loss": 0.0082, + "step": 4807, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018153112790005165, + "lm_loss": 0.006805419921875, + "loss": 0.0078, + "step": 4808, + "total_loss": 0.006805419921875 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001815236903752283, + "lm_loss": 0.0037994384765625, + "loss": 0.0081, + "step": 4809, + "total_loss": 0.0037994384765625 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018151625150556475, + "lm_loss": 0.00616455078125, + "loss": 0.0083, + "step": 4810, + "total_loss": 0.00616455078125 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018150881129118377, + "lm_loss": 0.00732421875, + "loss": 0.0063, + "step": 4811, + "total_loss": 0.00732421875 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018150136973220805, + "lm_loss": 0.00677490234375, + "loss": 0.0079, + "step": 4812, + "total_loss": 0.00677490234375 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001814939268287604, + "lm_loss": 0.0081787109375, + "loss": 0.0074, + "step": 4813, + "total_loss": 0.0081787109375 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018148648258096357, + "lm_loss": 0.0050048828125, + "loss": 0.0072, + "step": 4814, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018147903698894038, + "lm_loss": 0.005889892578125, + "loss": 0.0092, + "step": 4815, + "total_loss": 0.005889892578125 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018147159005281363, + "lm_loss": 0.01239013671875, + "loss": 0.008, + "step": 4816, + "total_loss": 0.01239013671875 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018146414177270617, + "lm_loss": 0.0054931640625, + "loss": 0.0085, + "step": 4817, + "total_loss": 0.0054931640625 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018145669214874092, + "lm_loss": 0.0108642578125, + "loss": 0.0077, + "step": 4818, + "total_loss": 0.0108642578125 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018144924118104072, + "lm_loss": 0.01055908203125, + "loss": 0.0082, + "step": 4819, + "total_loss": 0.01055908203125 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018144178886972848, + "lm_loss": 0.00579833984375, + "loss": 0.0079, + "step": 4820, + "total_loss": 0.00579833984375 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018143433521492717, + "lm_loss": 0.00506591796875, + "loss": 0.007, + "step": 4821, + "total_loss": 0.00506591796875 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018142688021675972, + "lm_loss": 0.005126953125, + "loss": 0.0081, + "step": 4822, + "total_loss": 0.005126953125 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001814194238753491, + "lm_loss": 0.00921630859375, + "loss": 0.0102, + "step": 4823, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018141196619081833, + "lm_loss": 0.005950927734375, + "loss": 0.0073, + "step": 4824, + "total_loss": 0.005950927734375 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018140450716329045, + "lm_loss": 0.0079345703125, + "loss": 0.0075, + "step": 4825, + "total_loss": 0.0079345703125 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001813970467928885, + "lm_loss": 0.006500244140625, + "loss": 0.0059, + "step": 4826, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018138958507973554, + "lm_loss": 0.0047607421875, + "loss": 0.0062, + "step": 4827, + "total_loss": 0.0047607421875 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001813821220239546, + "lm_loss": 0.0101318359375, + "loss": 0.0074, + "step": 4828, + "total_loss": 0.0101318359375 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018137465762566894, + "lm_loss": 0.002685546875, + "loss": 0.0061, + "step": 4829, + "total_loss": 0.002685546875 + }, + { + "epoch": 1.97, + "learning_rate": 0.00018136719188500158, + "lm_loss": 0.0067138671875, + "loss": 0.0072, + "step": 4830, + "total_loss": 0.0067138671875 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001813597248020757, + "lm_loss": 0.010498046875, + "loss": 0.0082, + "step": 4831, + "total_loss": 0.010498046875 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018135225637701447, + "lm_loss": 0.010986328125, + "loss": 0.0066, + "step": 4832, + "total_loss": 0.010986328125 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018134478660994114, + "lm_loss": 0.006683349609375, + "loss": 0.0047, + "step": 4833, + "total_loss": 0.006683349609375 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018133731550097886, + "lm_loss": 0.005218505859375, + "loss": 0.0065, + "step": 4834, + "total_loss": 0.005218505859375 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018132984305025093, + "lm_loss": 0.01171875, + "loss": 0.0077, + "step": 4835, + "total_loss": 0.01171875 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018132236925788063, + "lm_loss": 0.006591796875, + "loss": 0.008, + "step": 4836, + "total_loss": 0.006591796875 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018131489412399117, + "lm_loss": 0.007720947265625, + "loss": 0.0054, + "step": 4837, + "total_loss": 0.007720947265625 + }, + { + "epoch": 1.98, + "learning_rate": 0.000181307417648706, + "lm_loss": 0.006500244140625, + "loss": 0.0074, + "step": 4838, + "total_loss": 0.006500244140625 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001812999398321483, + "lm_loss": 0.0032501220703125, + "loss": 0.0058, + "step": 4839, + "total_loss": 0.0032501220703125 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018129246067444152, + "lm_loss": 0.00787353515625, + "loss": 0.0077, + "step": 4840, + "total_loss": 0.00787353515625 + }, + { + "epoch": 1.98, + "learning_rate": 0.000181284980175709, + "lm_loss": 0.0162353515625, + "loss": 0.0092, + "step": 4841, + "total_loss": 0.0162353515625 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018127749833607418, + "lm_loss": 0.0050048828125, + "loss": 0.0085, + "step": 4842, + "total_loss": 0.0050048828125 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018127001515566048, + "lm_loss": 0.0024566650390625, + "loss": 0.0065, + "step": 4843, + "total_loss": 0.0024566650390625 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018126253063459127, + "lm_loss": 0.00830078125, + "loss": 0.0086, + "step": 4844, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018125504477299007, + "lm_loss": 0.0142822265625, + "loss": 0.0076, + "step": 4845, + "total_loss": 0.0142822265625 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001812475575709804, + "lm_loss": 0.007049560546875, + "loss": 0.0067, + "step": 4846, + "total_loss": 0.007049560546875 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018124006902868573, + "lm_loss": 0.00634765625, + "loss": 0.0082, + "step": 4847, + "total_loss": 0.00634765625 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018123257914622962, + "lm_loss": 0.00506591796875, + "loss": 0.0066, + "step": 4848, + "total_loss": 0.00506591796875 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001812250879237356, + "lm_loss": 0.004730224609375, + "loss": 0.0084, + "step": 4849, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018121759536132722, + "lm_loss": 0.01025390625, + "loss": 0.0079, + "step": 4850, + "total_loss": 0.01025390625 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001812101014591282, + "lm_loss": 0.005340576171875, + "loss": 0.0078, + "step": 4851, + "total_loss": 0.005340576171875 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018120260621726198, + "lm_loss": 0.0027618408203125, + "loss": 0.007, + "step": 4852, + "total_loss": 0.0027618408203125 + }, + { + "epoch": 1.98, + "learning_rate": 0.0001811951096358524, + "lm_loss": 0.004425048828125, + "loss": 0.0052, + "step": 4853, + "total_loss": 0.004425048828125 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018118761171502295, + "lm_loss": 0.0118408203125, + "loss": 0.0078, + "step": 4854, + "total_loss": 0.0118408203125 + }, + { + "epoch": 1.98, + "learning_rate": 0.00018118011245489744, + "lm_loss": 0.0030517578125, + "loss": 0.0072, + "step": 4855, + "total_loss": 0.0030517578125 + }, + { + "epoch": 1.99, + "learning_rate": 0.0001811726118555995, + "lm_loss": 0.012939453125, + "loss": 0.0082, + "step": 4856, + "total_loss": 0.012939453125 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018116510991725293, + "lm_loss": 0.00457763671875, + "loss": 0.006, + "step": 4857, + "total_loss": 0.00457763671875 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018115760663998143, + "lm_loss": 0.0091552734375, + "loss": 0.0084, + "step": 4858, + "total_loss": 0.0091552734375 + }, + { + "epoch": 1.99, + "learning_rate": 0.0001811501020239088, + "lm_loss": 0.006011962890625, + "loss": 0.0049, + "step": 4859, + "total_loss": 0.006011962890625 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018114259606915887, + "lm_loss": 0.0120849609375, + "loss": 0.0083, + "step": 4860, + "total_loss": 0.0120849609375 + }, + { + "epoch": 1.99, + "learning_rate": 0.0001811350887758554, + "lm_loss": 0.00830078125, + "loss": 0.0084, + "step": 4861, + "total_loss": 0.00830078125 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018112758014412228, + "lm_loss": 0.007232666015625, + "loss": 0.0084, + "step": 4862, + "total_loss": 0.007232666015625 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018112007017408334, + "lm_loss": 0.00537109375, + "loss": 0.0064, + "step": 4863, + "total_loss": 0.00537109375 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018111255886586248, + "lm_loss": 0.004669189453125, + "loss": 0.006, + "step": 4864, + "total_loss": 0.004669189453125 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018110504621958362, + "lm_loss": 0.01171875, + "loss": 0.0079, + "step": 4865, + "total_loss": 0.01171875 + }, + { + "epoch": 1.99, + "learning_rate": 0.0001810975322353707, + "lm_loss": 0.00689697265625, + "loss": 0.0076, + "step": 4866, + "total_loss": 0.00689697265625 + }, + { + "epoch": 1.99, + "learning_rate": 0.0001810900169133476, + "lm_loss": 0.0032806396484375, + "loss": 0.0069, + "step": 4867, + "total_loss": 0.0032806396484375 + }, + { + "epoch": 1.99, + "learning_rate": 0.0001810825002536384, + "lm_loss": 0.0029754638671875, + "loss": 0.0061, + "step": 4868, + "total_loss": 0.0029754638671875 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018107498225636704, + "lm_loss": 0.00799560546875, + "loss": 0.0075, + "step": 4869, + "total_loss": 0.00799560546875 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018106746292165756, + "lm_loss": 0.004150390625, + "loss": 0.0052, + "step": 4870, + "total_loss": 0.004150390625 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018105994224963396, + "lm_loss": 0.004730224609375, + "loss": 0.007, + "step": 4871, + "total_loss": 0.004730224609375 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018105242024042033, + "lm_loss": 0.0069580078125, + "loss": 0.0068, + "step": 4872, + "total_loss": 0.0069580078125 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018104489689414077, + "lm_loss": 0.003448486328125, + "loss": 0.0076, + "step": 4873, + "total_loss": 0.003448486328125 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018103737221091936, + "lm_loss": 0.00921630859375, + "loss": 0.0076, + "step": 4874, + "total_loss": 0.00921630859375 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018102984619088026, + "lm_loss": 0.007171630859375, + "loss": 0.0079, + "step": 4875, + "total_loss": 0.007171630859375 + }, + { + "epoch": 1.99, + "learning_rate": 0.0001810223188341476, + "lm_loss": 0.01019287109375, + "loss": 0.0067, + "step": 4876, + "total_loss": 0.01019287109375 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018101479014084556, + "lm_loss": 0.01025390625, + "loss": 0.0076, + "step": 4877, + "total_loss": 0.01025390625 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018100726011109833, + "lm_loss": 0.009765625, + "loss": 0.0086, + "step": 4878, + "total_loss": 0.009765625 + }, + { + "epoch": 1.99, + "learning_rate": 0.00018099972874503016, + "lm_loss": 0.0244140625, + "loss": 0.0085, + "step": 4879, + "total_loss": 0.0244140625 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018099219604276524, + "lm_loss": 0.004730224609375, + "loss": 0.0071, + "step": 4880, + "total_loss": 0.004730224609375 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018098466200442784, + "lm_loss": 0.0027618408203125, + "loss": 0.0069, + "step": 4881, + "total_loss": 0.0027618408203125 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018097712663014227, + "lm_loss": 0.00701904296875, + "loss": 0.0065, + "step": 4882, + "total_loss": 0.00701904296875 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001809695899200328, + "lm_loss": 0.004547119140625, + "loss": 0.007, + "step": 4883, + "total_loss": 0.004547119140625 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018096205187422384, + "lm_loss": 0.01165771484375, + "loss": 0.0091, + "step": 4884, + "total_loss": 0.01165771484375 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018095451249283963, + "lm_loss": 0.017822265625, + "loss": 0.0084, + "step": 4885, + "total_loss": 0.017822265625 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001809469717760046, + "lm_loss": 0.0038604736328125, + "loss": 0.0073, + "step": 4886, + "total_loss": 0.0038604736328125 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018093942972384318, + "lm_loss": 0.01171875, + "loss": 0.0065, + "step": 4887, + "total_loss": 0.01171875 + }, + { + "epoch": 2.0, + "learning_rate": 0.0001809318863364797, + "lm_loss": 0.0079345703125, + "loss": 0.0066, + "step": 4888, + "total_loss": 0.0079345703125 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018092434161403866, + "lm_loss": 0.014404296875, + "loss": 0.0069, + "step": 4889, + "total_loss": 0.014404296875 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018091679555664447, + "lm_loss": 0.01458740234375, + "loss": 0.0062, + "step": 4890, + "total_loss": 0.01458740234375 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018090924816442167, + "lm_loss": 0.005615234375, + "loss": 0.0064, + "step": 4891, + "total_loss": 0.005615234375 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018090169943749476, + "lm_loss": 0.00145721435546875, + "loss": 0.0066, + "step": 4892, + "total_loss": 0.00145721435546875 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018089414937598822, + "lm_loss": 0.01080322265625, + "loss": 0.0078, + "step": 4893, + "total_loss": 0.01080322265625 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018088659798002664, + "lm_loss": 0.007049560546875, + "loss": 0.0059, + "step": 4894, + "total_loss": 0.007049560546875 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018087904524973454, + "lm_loss": 0.00933837890625, + "loss": 0.0078, + "step": 4895, + "total_loss": 0.00933837890625 + }, + { + "epoch": 2.0, + "learning_rate": 0.00018087149118523654, + "lm_loss": 0.010986328125, + "loss": 0.0065, + "step": 4896, + "total_loss": 0.010986328125 + } + ], + "max_steps": 24460, + "num_train_epochs": 10, + "total_flos": 4.56740603808113e+19, + "trial_name": null, + "trial_params": null +}