{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9966329966329966, "eval_steps": 500, "global_step": 148, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006734006734006734, "grad_norm": 74.241455078125, "learning_rate": 1.3333333333333333e-05, "loss": 2.5135, "step": 1 }, { "epoch": 0.03367003367003367, "grad_norm": 3.1438040733337402, "learning_rate": 6.666666666666667e-05, "loss": 2.4726, "step": 5 }, { "epoch": 0.06734006734006734, "grad_norm": 1.2921487092971802, "learning_rate": 0.00013333333333333334, "loss": 2.3003, "step": 10 }, { "epoch": 0.10101010101010101, "grad_norm": 1.0113569498062134, "learning_rate": 0.0002, "loss": 2.1321, "step": 15 }, { "epoch": 0.13468013468013468, "grad_norm": 2.183227777481079, "learning_rate": 0.00019930337092856243, "loss": 1.9945, "step": 20 }, { "epoch": 0.16835016835016836, "grad_norm": 0.47641709446907043, "learning_rate": 0.00019722318955551306, "loss": 1.8885, "step": 25 }, { "epoch": 0.20202020202020202, "grad_norm": 0.7186397910118103, "learning_rate": 0.00019378843817721854, "loss": 1.8266, "step": 30 }, { "epoch": 0.2356902356902357, "grad_norm": 0.37949636578559875, "learning_rate": 0.00018904697174694447, "loss": 1.7831, "step": 35 }, { "epoch": 0.26936026936026936, "grad_norm": 0.5123534202575684, "learning_rate": 0.0001830648511318223, "loss": 1.7567, "step": 40 }, { "epoch": 0.30303030303030304, "grad_norm": 0.5447937250137329, "learning_rate": 0.00017592542271443887, "loss": 1.7298, "step": 45 }, { "epoch": 0.3367003367003367, "grad_norm": 0.202178493142128, "learning_rate": 0.00016772815716257412, "loss": 1.7114, "step": 50 }, { "epoch": 0.37037037037037035, "grad_norm": 0.3912014663219452, "learning_rate": 0.00015858726354602248, "loss": 1.7067, "step": 55 }, { "epoch": 0.40404040404040403, "grad_norm": 0.2787935137748718, "learning_rate": 0.00014863009810942815, "loss": 1.6797, "step": 60 }, { "epoch": 0.4377104377104377, "grad_norm": 0.1719539761543274, "learning_rate": 0.000137995389871036, "loss": 1.6721, "step": 65 }, { "epoch": 0.4713804713804714, "grad_norm": 0.2619878053665161, "learning_rate": 0.0001268313077693485, "loss": 1.6664, "step": 70 }, { "epoch": 0.5050505050505051, "grad_norm": 0.268819659948349, "learning_rate": 0.0001152933962873246, "loss": 1.6685, "step": 75 }, { "epoch": 0.5387205387205387, "grad_norm": 0.1786731630563736, "learning_rate": 0.00010354240831620541, "loss": 1.6523, "step": 80 }, { "epoch": 0.5723905723905723, "grad_norm": 0.1858537644147873, "learning_rate": 9.174206545276677e-05, "loss": 1.6465, "step": 85 }, { "epoch": 0.6060606060606061, "grad_norm": 0.25154948234558105, "learning_rate": 8.005677693484077e-05, "loss": 1.6344, "step": 90 }, { "epoch": 0.6397306397306397, "grad_norm": 0.1697424203157425, "learning_rate": 6.864934899622191e-05, "loss": 1.6472, "step": 95 }, { "epoch": 0.6734006734006734, "grad_norm": 0.1832387000322342, "learning_rate": 5.767871655555751e-05, "loss": 1.6354, "step": 100 }, { "epoch": 0.7070707070707071, "grad_norm": 0.1768219918012619, "learning_rate": 4.729772884265212e-05, "loss": 1.6274, "step": 105 }, { "epoch": 0.7407407407407407, "grad_norm": 0.17322488129138947, "learning_rate": 3.7651019814126654e-05, "loss": 1.6235, "step": 110 }, { "epoch": 0.7744107744107744, "grad_norm": 0.1708805114030838, "learning_rate": 2.8872993029040508e-05, "loss": 1.6349, "step": 115 }, { "epoch": 0.8080808080808081, "grad_norm": 0.1731610745191574, "learning_rate": 2.1085949060360654e-05, "loss": 1.6322, "step": 120 }, { "epoch": 0.8417508417508418, "grad_norm": 0.1952390819787979, "learning_rate": 1.439838153227e-05, "loss": 1.6391, "step": 125 }, { "epoch": 0.8754208754208754, "grad_norm": 0.1885603815317154, "learning_rate": 8.903465523913957e-06, "loss": 1.6368, "step": 130 }, { "epoch": 0.9090909090909091, "grad_norm": 0.19654668867588043, "learning_rate": 4.6777594000230855e-06, "loss": 1.6307, "step": 135 }, { "epoch": 0.9427609427609428, "grad_norm": 0.17118915915489197, "learning_rate": 1.7801381552624563e-06, "loss": 1.6287, "step": 140 }, { "epoch": 0.9764309764309764, "grad_norm": 0.17712068557739258, "learning_rate": 2.509731335744281e-07, "loss": 1.632, "step": 145 }, { "epoch": 0.9966329966329966, "eval_loss": 2.3924484252929688, "eval_runtime": 0.6217, "eval_samples_per_second": 33.78, "eval_steps_per_second": 1.609, "step": 148 }, { "epoch": 0.9966329966329966, "step": 148, "total_flos": 6.55432743055786e+17, "train_loss": 1.752148318935085, "train_runtime": 587.2689, "train_samples_per_second": 48.443, "train_steps_per_second": 0.252 } ], "logging_steps": 5, "max_steps": 148, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.55432743055786e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }