{ "best_metric": 1.0328505039215088, "best_model_checkpoint": "/kaggle/output/checkpoint-42000", "epoch": 1.7519556714471969, "eval_steps": 1000, "global_step": 43000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.7777777777777777e-11, "loss": 1.2184, "step": 1 }, { "epoch": 0.04, "learning_rate": 2.7750000000000004e-08, "loss": 1.1394, "step": 1000 }, { "epoch": 0.04, "eval_accuracy": 0.3327345309381238, "eval_loss": 1.1149410009384155, "eval_runtime": 20.6803, "eval_samples_per_second": 242.26, "eval_steps_per_second": 30.319, "step": 1000 }, { "epoch": 0.08, "learning_rate": 5.5527777777777784e-08, "loss": 1.1141, "step": 2000 }, { "epoch": 0.08, "eval_accuracy": 0.3401197604790419, "eval_loss": 1.104099988937378, "eval_runtime": 20.8477, "eval_samples_per_second": 240.314, "eval_steps_per_second": 30.075, "step": 2000 }, { "epoch": 0.12, "learning_rate": 8.330555555555556e-08, "loss": 1.116, "step": 3000 }, { "epoch": 0.12, "eval_accuracy": 0.3407185628742515, "eval_loss": 1.1040862798690796, "eval_runtime": 20.6818, "eval_samples_per_second": 242.242, "eval_steps_per_second": 30.317, "step": 3000 }, { "epoch": 0.16, "learning_rate": 1.1108333333333333e-07, "loss": 1.1158, "step": 4000 }, { "epoch": 0.16, "eval_accuracy": 0.32894211576846305, "eval_loss": 1.1020556688308716, "eval_runtime": 20.8541, "eval_samples_per_second": 240.241, "eval_steps_per_second": 30.066, "step": 4000 }, { "epoch": 0.2, "learning_rate": 1.3883333333333335e-07, "loss": 1.1135, "step": 5000 }, { "epoch": 0.2, "eval_accuracy": 0.34271457085828344, "eval_loss": 1.1008552312850952, "eval_runtime": 20.8055, "eval_samples_per_second": 240.802, "eval_steps_per_second": 30.136, "step": 5000 }, { "epoch": 0.24, "learning_rate": 1.6658333333333335e-07, "loss": 1.1121, "step": 6000 }, { "epoch": 0.24, "eval_accuracy": 0.3395209580838323, "eval_loss": 1.1004050970077515, "eval_runtime": 20.8985, "eval_samples_per_second": 239.731, "eval_steps_per_second": 30.002, "step": 6000 }, { "epoch": 0.29, "learning_rate": 1.9436111111111112e-07, "loss": 1.1089, "step": 7000 }, { "epoch": 0.29, "eval_accuracy": 0.35788423153692617, "eval_loss": 1.0985721349716187, "eval_runtime": 20.84, "eval_samples_per_second": 240.403, "eval_steps_per_second": 30.086, "step": 7000 }, { "epoch": 0.33, "learning_rate": 2.2213888888888891e-07, "loss": 1.1079, "step": 8000 }, { "epoch": 0.33, "eval_accuracy": 0.3331337325349301, "eval_loss": 1.098374843597412, "eval_runtime": 20.7886, "eval_samples_per_second": 240.998, "eval_steps_per_second": 30.161, "step": 8000 }, { "epoch": 0.37, "learning_rate": 2.4988888888888893e-07, "loss": 1.1087, "step": 9000 }, { "epoch": 0.37, "eval_accuracy": 0.34510978043912177, "eval_loss": 1.0993521213531494, "eval_runtime": 20.782, "eval_samples_per_second": 241.074, "eval_steps_per_second": 30.17, "step": 9000 }, { "epoch": 0.41, "learning_rate": 2.776666666666667e-07, "loss": 1.109, "step": 10000 }, { "epoch": 0.41, "eval_accuracy": 0.3475049900199601, "eval_loss": 1.0967597961425781, "eval_runtime": 20.6798, "eval_samples_per_second": 242.265, "eval_steps_per_second": 30.319, "step": 10000 }, { "epoch": 0.45, "learning_rate": 3.054444444444444e-07, "loss": 1.1052, "step": 11000 }, { "epoch": 0.45, "eval_accuracy": 0.37544910179640717, "eval_loss": 1.0941349267959595, "eval_runtime": 20.8641, "eval_samples_per_second": 240.126, "eval_steps_per_second": 30.052, "step": 11000 }, { "epoch": 0.49, "learning_rate": 3.3322222222222225e-07, "loss": 1.105, "step": 12000 }, { "epoch": 0.49, "eval_accuracy": 0.3834331337325349, "eval_loss": 1.0927647352218628, "eval_runtime": 20.6541, "eval_samples_per_second": 242.567, "eval_steps_per_second": 30.357, "step": 12000 }, { "epoch": 0.53, "learning_rate": 3.609722222222222e-07, "loss": 1.1016, "step": 13000 }, { "epoch": 0.53, "eval_accuracy": 0.3457085828343313, "eval_loss": 1.0942081212997437, "eval_runtime": 21.0733, "eval_samples_per_second": 237.742, "eval_steps_per_second": 29.753, "step": 13000 }, { "epoch": 0.57, "learning_rate": 3.8875e-07, "loss": 1.1031, "step": 14000 }, { "epoch": 0.57, "eval_accuracy": 0.37005988023952097, "eval_loss": 1.0918152332305908, "eval_runtime": 20.9151, "eval_samples_per_second": 239.54, "eval_steps_per_second": 29.978, "step": 14000 }, { "epoch": 0.61, "learning_rate": 4.1652777777777786e-07, "loss": 1.1026, "step": 15000 }, { "epoch": 0.61, "eval_accuracy": 0.3790419161676647, "eval_loss": 1.0895211696624756, "eval_runtime": 21.0591, "eval_samples_per_second": 237.902, "eval_steps_per_second": 29.773, "step": 15000 }, { "epoch": 0.65, "learning_rate": 4.4427777777777783e-07, "loss": 1.0988, "step": 16000 }, { "epoch": 0.65, "eval_accuracy": 0.4101796407185629, "eval_loss": 1.0852997303009033, "eval_runtime": 20.9509, "eval_samples_per_second": 239.131, "eval_steps_per_second": 29.927, "step": 16000 }, { "epoch": 0.69, "learning_rate": 4.720555555555556e-07, "loss": 1.0974, "step": 17000 }, { "epoch": 0.69, "eval_accuracy": 0.43213572854291415, "eval_loss": 1.0791982412338257, "eval_runtime": 20.7526, "eval_samples_per_second": 241.415, "eval_steps_per_second": 30.213, "step": 17000 }, { "epoch": 0.73, "learning_rate": 4.998055555555556e-07, "loss": 1.0932, "step": 18000 }, { "epoch": 0.73, "eval_accuracy": 0.4275449101796407, "eval_loss": 1.072191596031189, "eval_runtime": 21.2435, "eval_samples_per_second": 235.837, "eval_steps_per_second": 29.515, "step": 18000 }, { "epoch": 0.77, "learning_rate": 5.275833333333334e-07, "loss": 1.0833, "step": 19000 }, { "epoch": 0.77, "eval_accuracy": 0.43233532934131735, "eval_loss": 1.06425940990448, "eval_runtime": 20.7923, "eval_samples_per_second": 240.955, "eval_steps_per_second": 30.155, "step": 19000 }, { "epoch": 0.81, "learning_rate": 5.553333333333334e-07, "loss": 1.0787, "step": 20000 }, { "epoch": 0.81, "eval_accuracy": 0.4295409181636727, "eval_loss": 1.0638529062271118, "eval_runtime": 21.0018, "eval_samples_per_second": 238.551, "eval_steps_per_second": 29.855, "step": 20000 }, { "epoch": 0.86, "learning_rate": 5.831111111111111e-07, "loss": 1.0779, "step": 21000 }, { "epoch": 0.86, "eval_accuracy": 0.4243512974051896, "eval_loss": 1.0603673458099365, "eval_runtime": 20.9689, "eval_samples_per_second": 238.926, "eval_steps_per_second": 29.901, "step": 21000 }, { "epoch": 0.9, "learning_rate": 6.108888888888888e-07, "loss": 1.0751, "step": 22000 }, { "epoch": 0.9, "eval_accuracy": 0.43233532934131735, "eval_loss": 1.0603009462356567, "eval_runtime": 20.8897, "eval_samples_per_second": 239.831, "eval_steps_per_second": 30.015, "step": 22000 }, { "epoch": 0.94, "learning_rate": 6.386388888888889e-07, "loss": 1.0776, "step": 23000 }, { "epoch": 0.94, "eval_accuracy": 0.42734530938123755, "eval_loss": 1.0591468811035156, "eval_runtime": 20.964, "eval_samples_per_second": 238.981, "eval_steps_per_second": 29.908, "step": 23000 }, { "epoch": 0.98, "learning_rate": 6.664166666666667e-07, "loss": 1.0754, "step": 24000 }, { "epoch": 0.98, "eval_accuracy": 0.4245508982035928, "eval_loss": 1.0589721202850342, "eval_runtime": 20.9053, "eval_samples_per_second": 239.652, "eval_steps_per_second": 29.992, "step": 24000 }, { "epoch": 1.02, "learning_rate": 6.941666666666667e-07, "loss": 1.0736, "step": 25000 }, { "epoch": 1.02, "eval_accuracy": 0.43213572854291415, "eval_loss": 1.0583962202072144, "eval_runtime": 21.3265, "eval_samples_per_second": 234.919, "eval_steps_per_second": 29.4, "step": 25000 }, { "epoch": 1.06, "learning_rate": 7.219444444444444e-07, "loss": 1.0717, "step": 26000 }, { "epoch": 1.06, "eval_accuracy": 0.4305389221556886, "eval_loss": 1.0561293363571167, "eval_runtime": 21.3034, "eval_samples_per_second": 235.174, "eval_steps_per_second": 29.432, "step": 26000 }, { "epoch": 1.1, "learning_rate": 7.496944444444444e-07, "loss": 1.0709, "step": 27000 }, { "epoch": 1.1, "eval_accuracy": 0.4281437125748503, "eval_loss": 1.0555357933044434, "eval_runtime": 21.2178, "eval_samples_per_second": 236.123, "eval_steps_per_second": 29.551, "step": 27000 }, { "epoch": 1.14, "learning_rate": 7.774722222222223e-07, "loss": 1.0701, "step": 28000 }, { "epoch": 1.14, "eval_accuracy": 0.4217564870259481, "eval_loss": 1.054961085319519, "eval_runtime": 21.1775, "eval_samples_per_second": 236.571, "eval_steps_per_second": 29.607, "step": 28000 }, { "epoch": 1.18, "learning_rate": 8.052222222222223e-07, "loss": 1.0641, "step": 29000 }, { "epoch": 1.18, "eval_accuracy": 0.4291417165668663, "eval_loss": 1.0518379211425781, "eval_runtime": 21.0932, "eval_samples_per_second": 237.517, "eval_steps_per_second": 29.725, "step": 29000 }, { "epoch": 1.22, "learning_rate": 8.330000000000001e-07, "loss": 1.064, "step": 30000 }, { "epoch": 1.22, "eval_accuracy": 0.43173652694610776, "eval_loss": 1.0493717193603516, "eval_runtime": 21.2843, "eval_samples_per_second": 235.385, "eval_steps_per_second": 29.458, "step": 30000 }, { "epoch": 1.26, "learning_rate": 8.607500000000001e-07, "loss": 1.0693, "step": 31000 }, { "epoch": 1.26, "eval_accuracy": 0.4291417165668663, "eval_loss": 1.0521764755249023, "eval_runtime": 21.2278, "eval_samples_per_second": 236.011, "eval_steps_per_second": 29.537, "step": 31000 }, { "epoch": 1.3, "learning_rate": 8.885277777777779e-07, "loss": 1.0649, "step": 32000 }, { "epoch": 1.3, "eval_accuracy": 0.42375249500998, "eval_loss": 1.0528494119644165, "eval_runtime": 21.1249, "eval_samples_per_second": 237.161, "eval_steps_per_second": 29.681, "step": 32000 }, { "epoch": 1.34, "learning_rate": 9.163055555555556e-07, "loss": 1.0619, "step": 33000 }, { "epoch": 1.34, "eval_accuracy": 0.43293413173652695, "eval_loss": 1.049193263053894, "eval_runtime": 21.241, "eval_samples_per_second": 235.865, "eval_steps_per_second": 29.518, "step": 33000 }, { "epoch": 1.39, "learning_rate": 9.440555555555557e-07, "loss": 1.0582, "step": 34000 }, { "epoch": 1.39, "eval_accuracy": 0.4357285429141717, "eval_loss": 1.04512619972229, "eval_runtime": 21.2531, "eval_samples_per_second": 235.73, "eval_steps_per_second": 29.502, "step": 34000 }, { "epoch": 1.43, "learning_rate": 9.718333333333334e-07, "loss": 1.0629, "step": 35000 }, { "epoch": 1.43, "eval_accuracy": 0.43253493013972055, "eval_loss": 1.043523907661438, "eval_runtime": 21.2176, "eval_samples_per_second": 236.124, "eval_steps_per_second": 29.551, "step": 35000 }, { "epoch": 1.47, "learning_rate": 9.995833333333334e-07, "loss": 1.0588, "step": 36000 }, { "epoch": 1.47, "eval_accuracy": 0.4307385229540918, "eval_loss": 1.0413768291473389, "eval_runtime": 21.2225, "eval_samples_per_second": 236.07, "eval_steps_per_second": 29.544, "step": 36000 }, { "epoch": 1.51, "learning_rate": 1.0273611111111112e-06, "loss": 1.0552, "step": 37000 }, { "epoch": 1.51, "eval_accuracy": 0.4415169660678643, "eval_loss": 1.0397700071334839, "eval_runtime": 21.0651, "eval_samples_per_second": 237.835, "eval_steps_per_second": 29.765, "step": 37000 }, { "epoch": 1.55, "learning_rate": 1.055138888888889e-06, "loss": 1.0567, "step": 38000 }, { "epoch": 1.55, "eval_accuracy": 0.4419161676646707, "eval_loss": 1.0390877723693848, "eval_runtime": 21.2858, "eval_samples_per_second": 235.368, "eval_steps_per_second": 29.456, "step": 38000 }, { "epoch": 1.59, "learning_rate": 1.0829166666666667e-06, "loss": 1.054, "step": 39000 }, { "epoch": 1.59, "eval_accuracy": 0.43952095808383235, "eval_loss": 1.0404103994369507, "eval_runtime": 21.2544, "eval_samples_per_second": 235.715, "eval_steps_per_second": 29.5, "step": 39000 }, { "epoch": 1.63, "learning_rate": 1.1106666666666668e-06, "loss": 1.049, "step": 40000 }, { "epoch": 1.63, "eval_accuracy": 0.4477045908183633, "eval_loss": 1.0360453128814697, "eval_runtime": 21.0993, "eval_samples_per_second": 237.448, "eval_steps_per_second": 29.717, "step": 40000 }, { "epoch": 1.67, "learning_rate": 1.1384444444444446e-06, "loss": 1.0522, "step": 41000 }, { "epoch": 1.67, "eval_accuracy": 0.443313373253493, "eval_loss": 1.0359567403793335, "eval_runtime": 21.2828, "eval_samples_per_second": 235.401, "eval_steps_per_second": 29.46, "step": 41000 }, { "epoch": 1.71, "learning_rate": 1.1661944444444447e-06, "loss": 1.0459, "step": 42000 }, { "epoch": 1.71, "eval_accuracy": 0.437125748502994, "eval_loss": 1.0328505039215088, "eval_runtime": 21.2626, "eval_samples_per_second": 235.625, "eval_steps_per_second": 29.488, "step": 42000 }, { "epoch": 1.75, "learning_rate": 1.1939722222222222e-06, "loss": 1.0488, "step": 43000 }, { "epoch": 1.75, "eval_accuracy": 0.4379241516966068, "eval_loss": 1.0460196733474731, "eval_runtime": 21.2023, "eval_samples_per_second": 236.295, "eval_steps_per_second": 29.572, "step": 43000 } ], "logging_steps": 1000, "max_steps": 10000000, "num_train_epochs": 408, "save_steps": 1000, "total_flos": 2.9963292961406976e+16, "trial_name": null, "trial_params": null }