{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3, "global_step": 72, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.125, "grad_norm": 85.74043273925781, "learning_rate": 2.4935958926171335e-06, "loss": 2.3489, "step": 3 }, { "epoch": 0.125, "eval_loss": 1.543222427368164, "eval_runtime": 54.7414, "eval_samples_per_second": 5.48, "eval_steps_per_second": 0.091, "step": 3 }, { "epoch": 0.25, "grad_norm": 95.90210723876953, "learning_rate": 2.3851786798946492e-06, "loss": 2.0111, "step": 6 }, { "epoch": 0.25, "eval_loss": 1.487854242324829, "eval_runtime": 55.6199, "eval_samples_per_second": 5.394, "eval_steps_per_second": 0.09, "step": 6 }, { "epoch": 0.375, "grad_norm": 91.1529769897461, "learning_rate": 2.276761467172165e-06, "loss": 1.9639, "step": 9 }, { "epoch": 0.375, "eval_loss": 1.4351141452789307, "eval_runtime": 54.5584, "eval_samples_per_second": 5.499, "eval_steps_per_second": 0.092, "step": 9 }, { "epoch": 0.5, "grad_norm": 85.8731918334961, "learning_rate": 2.168344254449681e-06, "loss": 1.8783, "step": 12 }, { "epoch": 0.5, "eval_loss": 1.3963404893875122, "eval_runtime": 72.7982, "eval_samples_per_second": 4.121, "eval_steps_per_second": 0.069, "step": 12 }, { "epoch": 0.625, "grad_norm": 71.83324432373047, "learning_rate": 2.059927041727197e-06, "loss": 2.0128, "step": 15 }, { "epoch": 0.625, "eval_loss": 1.3604711294174194, "eval_runtime": 55.05, "eval_samples_per_second": 5.45, "eval_steps_per_second": 0.091, "step": 15 }, { "epoch": 0.75, "grad_norm": 79.30952453613281, "learning_rate": 1.9515098290047127e-06, "loss": 2.1604, "step": 18 }, { "epoch": 0.75, "eval_loss": 1.3254843950271606, "eval_runtime": 55.0752, "eval_samples_per_second": 5.447, "eval_steps_per_second": 0.091, "step": 18 }, { "epoch": 0.875, "grad_norm": 75.95362091064453, "learning_rate": 1.843092616282229e-06, "loss": 1.8474, "step": 21 }, { "epoch": 0.875, "eval_loss": 1.302057147026062, "eval_runtime": 56.2737, "eval_samples_per_second": 5.331, "eval_steps_per_second": 0.089, "step": 21 }, { "epoch": 1.0, "grad_norm": 91.8495101928711, "learning_rate": 1.7346754035597447e-06, "loss": 1.6354, "step": 24 }, { "epoch": 1.0, "eval_loss": 1.2863010168075562, "eval_runtime": 54.0492, "eval_samples_per_second": 5.55, "eval_steps_per_second": 0.093, "step": 24 }, { "epoch": 1.125, "grad_norm": 54.94986343383789, "learning_rate": 1.6262581908372609e-06, "loss": 1.2718, "step": 27 }, { "epoch": 1.125, "eval_loss": 1.275192141532898, "eval_runtime": 65.5923, "eval_samples_per_second": 4.574, "eval_steps_per_second": 0.076, "step": 27 }, { "epoch": 1.25, "grad_norm": 58.67612075805664, "learning_rate": 1.5178409781147769e-06, "loss": 1.3576, "step": 30 }, { "epoch": 1.25, "eval_loss": 1.2581545114517212, "eval_runtime": 54.0691, "eval_samples_per_second": 5.548, "eval_steps_per_second": 0.092, "step": 30 }, { "epoch": 1.375, "grad_norm": 46.77408218383789, "learning_rate": 1.4094237653922926e-06, "loss": 1.0203, "step": 33 }, { "epoch": 1.375, "eval_loss": 1.2414389848709106, "eval_runtime": 65.9072, "eval_samples_per_second": 4.552, "eval_steps_per_second": 0.076, "step": 33 }, { "epoch": 1.5, "grad_norm": 56.48828125, "learning_rate": 1.3010065526698086e-06, "loss": 1.3456, "step": 36 }, { "epoch": 1.5, "eval_loss": 1.2292382717132568, "eval_runtime": 65.3296, "eval_samples_per_second": 4.592, "eval_steps_per_second": 0.077, "step": 36 }, { "epoch": 1.625, "grad_norm": 60.266910552978516, "learning_rate": 1.1925893399473246e-06, "loss": 1.3943, "step": 39 }, { "epoch": 1.625, "eval_loss": 1.2222548723220825, "eval_runtime": 53.6967, "eval_samples_per_second": 5.587, "eval_steps_per_second": 0.093, "step": 39 }, { "epoch": 1.75, "grad_norm": 57.817108154296875, "learning_rate": 1.0841721272248406e-06, "loss": 1.1913, "step": 42 }, { "epoch": 1.75, "eval_loss": 1.2170686721801758, "eval_runtime": 53.4662, "eval_samples_per_second": 5.611, "eval_steps_per_second": 0.094, "step": 42 }, { "epoch": 1.875, "grad_norm": 57.16936492919922, "learning_rate": 9.757549145023564e-07, "loss": 1.2778, "step": 45 }, { "epoch": 1.875, "eval_loss": 1.2100152969360352, "eval_runtime": 53.4914, "eval_samples_per_second": 5.608, "eval_steps_per_second": 0.093, "step": 45 }, { "epoch": 2.0, "grad_norm": 95.0725326538086, "learning_rate": 8.673377017798723e-07, "loss": 1.1118, "step": 48 }, { "epoch": 2.0, "eval_loss": 1.2054405212402344, "eval_runtime": 54.251, "eval_samples_per_second": 5.53, "eval_steps_per_second": 0.092, "step": 48 }, { "epoch": 2.125, "grad_norm": 43.22427749633789, "learning_rate": 7.589204890573884e-07, "loss": 1.1816, "step": 51 }, { "epoch": 2.125, "eval_loss": 1.2018208503723145, "eval_runtime": 53.8326, "eval_samples_per_second": 5.573, "eval_steps_per_second": 0.093, "step": 51 }, { "epoch": 2.25, "grad_norm": 49.27873229980469, "learning_rate": 6.505032763349043e-07, "loss": 1.087, "step": 54 }, { "epoch": 2.25, "eval_loss": 1.1997495889663696, "eval_runtime": 55.39, "eval_samples_per_second": 5.416, "eval_steps_per_second": 0.09, "step": 54 }, { "epoch": 2.375, "grad_norm": 54.04515838623047, "learning_rate": 5.420860636124203e-07, "loss": 0.9709, "step": 57 }, { "epoch": 2.375, "eval_loss": 1.1992498636245728, "eval_runtime": 54.7201, "eval_samples_per_second": 5.482, "eval_steps_per_second": 0.091, "step": 57 }, { "epoch": 2.5, "grad_norm": 50.179283142089844, "learning_rate": 4.336688508899362e-07, "loss": 0.9793, "step": 60 }, { "epoch": 2.5, "eval_loss": 1.1989630460739136, "eval_runtime": 54.701, "eval_samples_per_second": 5.484, "eval_steps_per_second": 0.091, "step": 60 }, { "epoch": 2.625, "grad_norm": 48.734962463378906, "learning_rate": 3.2525163816745216e-07, "loss": 1.0414, "step": 63 }, { "epoch": 2.625, "eval_loss": 1.1992130279541016, "eval_runtime": 53.7456, "eval_samples_per_second": 5.582, "eval_steps_per_second": 0.093, "step": 63 }, { "epoch": 2.75, "grad_norm": 49.651309967041016, "learning_rate": 2.168344254449681e-07, "loss": 0.8422, "step": 66 }, { "epoch": 2.75, "eval_loss": 1.1986463069915771, "eval_runtime": 53.6334, "eval_samples_per_second": 5.594, "eval_steps_per_second": 0.093, "step": 66 }, { "epoch": 2.875, "grad_norm": 48.41971969604492, "learning_rate": 1.0841721272248404e-07, "loss": 0.9034, "step": 69 }, { "epoch": 2.875, "eval_loss": 1.1979520320892334, "eval_runtime": 61.544, "eval_samples_per_second": 4.875, "eval_steps_per_second": 0.081, "step": 69 }, { "epoch": 3.0, "grad_norm": 69.29705810546875, "learning_rate": 0.0, "loss": 0.8757, "step": 72 }, { "epoch": 3.0, "eval_loss": 1.197731375694275, "eval_runtime": 54.2475, "eval_samples_per_second": 5.53, "eval_steps_per_second": 0.092, "step": 72 } ], "logging_steps": 3, "max_steps": 72, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 523328480700102.0, "train_batch_size": 128, "trial_name": null, "trial_params": { "_wandb": {}, "assignments": {}, "decay": 0.1, "learning_rate": 2.6020131053396173e-06, "metric": "eval/loss", "per_device_train_batch_size": 128 } }