{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7836990595611285, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001567398119122257, "grad_norm": 0.5552594661712646, "learning_rate": 3.448275862068966e-06, "loss": 3.3921, "step": 1 }, { "epoch": 0.01567398119122257, "grad_norm": 0.660531222820282, "learning_rate": 3.4482758620689657e-05, "loss": 3.5922, "step": 10 }, { "epoch": 0.03134796238244514, "grad_norm": 0.5598097443580627, "learning_rate": 6.896551724137931e-05, "loss": 3.619, "step": 20 }, { "epoch": 0.047021943573667714, "grad_norm": 0.5996779799461365, "learning_rate": 0.00010344827586206898, "loss": 3.4824, "step": 30 }, { "epoch": 0.06269592476489028, "grad_norm": 0.8172075152397156, "learning_rate": 0.00013793103448275863, "loss": 3.3997, "step": 40 }, { "epoch": 0.07836990595611286, "grad_norm": 1.6439019441604614, "learning_rate": 0.00017241379310344826, "loss": 3.2485, "step": 50 }, { "epoch": 0.09404388714733543, "grad_norm": 0.28180772066116333, "learning_rate": 0.00019999942697524717, "loss": 2.9975, "step": 60 }, { "epoch": 0.109717868338558, "grad_norm": 0.3870689868927002, "learning_rate": 0.00019997937179843937, "loss": 3.0446, "step": 70 }, { "epoch": 0.12539184952978055, "grad_norm": 0.575548529624939, "learning_rate": 0.00019993067195079803, "loss": 3.0178, "step": 80 }, { "epoch": 0.14106583072100312, "grad_norm": 0.781446635723114, "learning_rate": 0.00019985334138511237, "loss": 3.0394, "step": 90 }, { "epoch": 0.15673981191222572, "grad_norm": 1.725696325302124, "learning_rate": 0.00019974740225703878, "loss": 3.0751, "step": 100 }, { "epoch": 0.1724137931034483, "grad_norm": 0.2845414876937866, "learning_rate": 0.00019961288491875278, "loss": 2.9291, "step": 110 }, { "epoch": 0.18808777429467086, "grad_norm": 0.36810651421546936, "learning_rate": 0.00019944982791025333, "loss": 2.9491, "step": 120 }, { "epoch": 0.20376175548589343, "grad_norm": 0.5454439520835876, "learning_rate": 0.00019925827794832056, "loss": 3.0337, "step": 130 }, { "epoch": 0.219435736677116, "grad_norm": 0.6503669619560242, "learning_rate": 0.00019903828991313138, "loss": 3.0246, "step": 140 }, { "epoch": 0.23510971786833856, "grad_norm": 1.4392451047897339, "learning_rate": 0.00019878992683253582, "loss": 3.0232, "step": 150 }, { "epoch": 0.2507836990595611, "grad_norm": 0.2859440743923187, "learning_rate": 0.00019851325986399934, "loss": 2.8955, "step": 160 }, { "epoch": 0.2664576802507837, "grad_norm": 0.44268104434013367, "learning_rate": 0.0001982083682742156, "loss": 2.9338, "step": 170 }, { "epoch": 0.28213166144200624, "grad_norm": 0.5128395557403564, "learning_rate": 0.00019787533941639638, "loss": 3.0089, "step": 180 }, { "epoch": 0.29780564263322884, "grad_norm": 0.7328920364379883, "learning_rate": 0.00019751426870524407, "loss": 3.0157, "step": 190 }, { "epoch": 0.31347962382445144, "grad_norm": 1.5265012979507446, "learning_rate": 0.000197125259589615, "loss": 2.9007, "step": 200 }, { "epoch": 0.329153605015674, "grad_norm": 0.2766813635826111, "learning_rate": 0.0001967084235228807, "loss": 2.8275, "step": 210 }, { "epoch": 0.3448275862068966, "grad_norm": 0.36695396900177, "learning_rate": 0.00019626387993099579, "loss": 2.9158, "step": 220 }, { "epoch": 0.3605015673981191, "grad_norm": 0.5359162092208862, "learning_rate": 0.00019579175617828187, "loss": 2.9465, "step": 230 }, { "epoch": 0.3761755485893417, "grad_norm": 0.6529833674430847, 
"learning_rate": 0.0001952921875309368, "loss": 2.981, "step": 240 }, { "epoch": 0.39184952978056425, "grad_norm": 1.5314627885818481, "learning_rate": 0.00019476531711828027, "loss": 2.9737, "step": 250 }, { "epoch": 0.40752351097178685, "grad_norm": 0.2949506342411041, "learning_rate": 0.00019421129589174618, "loss": 2.8208, "step": 260 }, { "epoch": 0.4231974921630094, "grad_norm": 0.39567869901657104, "learning_rate": 0.00019363028258163447, "loss": 2.8557, "step": 270 }, { "epoch": 0.438871473354232, "grad_norm": 0.5587254166603088, "learning_rate": 0.00019302244365163376, "loss": 2.9494, "step": 280 }, { "epoch": 0.45454545454545453, "grad_norm": 0.7218978404998779, "learning_rate": 0.0001923879532511287, "loss": 2.9742, "step": 290 }, { "epoch": 0.4702194357366771, "grad_norm": 1.4482598304748535, "learning_rate": 0.0001917269931653049, "loss": 2.8646, "step": 300 }, { "epoch": 0.48589341692789967, "grad_norm": 0.2901701033115387, "learning_rate": 0.00019103975276306678, "loss": 2.7788, "step": 310 }, { "epoch": 0.5015673981191222, "grad_norm": 0.4310539960861206, "learning_rate": 0.00019032642894278192, "loss": 2.8655, "step": 320 }, { "epoch": 0.5172413793103449, "grad_norm": 0.5589954853057861, "learning_rate": 0.0001895872260758688, "loss": 2.914, "step": 330 }, { "epoch": 0.5329153605015674, "grad_norm": 0.7243526577949524, "learning_rate": 0.00018882235594824308, "loss": 2.9191, "step": 340 }, { "epoch": 0.54858934169279, "grad_norm": 1.4200222492218018, "learning_rate": 0.00018803203769963967, "loss": 2.8128, "step": 350 }, { "epoch": 0.5642633228840125, "grad_norm": 0.26594147086143494, "learning_rate": 0.000187216497760828, "loss": 2.762, "step": 360 }, { "epoch": 0.5799373040752351, "grad_norm": 0.3894258439540863, "learning_rate": 0.00018637596978873835, "loss": 2.9077, "step": 370 }, { "epoch": 0.5956112852664577, "grad_norm": 0.5348561406135559, "learning_rate": 0.00018551069459951758, "loss": 2.9292, "step": 380 }, { "epoch": 0.6112852664576802, "grad_norm": 0.746507465839386, "learning_rate": 0.00018462092009953408, "loss": 2.8795, "step": 390 }, { "epoch": 0.6269592476489029, "grad_norm": 1.5225753784179688, "learning_rate": 0.0001837069012143511, "loss": 2.8263, "step": 400 }, { "epoch": 0.6426332288401254, "grad_norm": 0.26905450224876404, "learning_rate": 0.00018276889981568906, "loss": 2.7218, "step": 410 }, { "epoch": 0.658307210031348, "grad_norm": 0.3912515342235565, "learning_rate": 0.00018180718464639787, "loss": 2.819, "step": 420 }, { "epoch": 0.6739811912225705, "grad_norm": 0.5661373138427734, "learning_rate": 0.00018082203124346045, "loss": 2.8772, "step": 430 }, { "epoch": 0.6896551724137931, "grad_norm": 0.805776059627533, "learning_rate": 0.0001798137218590498, "loss": 2.9562, "step": 440 }, { "epoch": 0.7053291536050157, "grad_norm": 1.4879732131958008, "learning_rate": 0.00017878254537966216, "loss": 2.7925, "step": 450 }, { "epoch": 0.7210031347962382, "grad_norm": 0.28762391209602356, "learning_rate": 0.00017772879724334937, "loss": 2.8006, "step": 460 }, { "epoch": 0.7366771159874608, "grad_norm": 0.41769474744796753, "learning_rate": 0.00017665277935507398, "loss": 2.8148, "step": 470 }, { "epoch": 0.7523510971786834, "grad_norm": 0.633671760559082, "learning_rate": 0.00017555480000021198, "loss": 2.8461, "step": 480 }, { "epoch": 0.768025078369906, "grad_norm": 0.816681444644928, "learning_rate": 0.00017443517375622704, "loss": 2.8826, "step": 490 }, { "epoch": 0.7836990595611285, "grad_norm": 1.3913438320159912, "learning_rate": 
0.00017329422140254235, "loss": 2.7449, "step": 500 } ], "logging_steps": 10, "max_steps": 1914, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.158266300583117e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }
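The block below is not part of the state file itself; it is a minimal Python sketch of how such a Trainer state file could be inspected, assuming it lives at the hypothetical path "checkpoint-500/trainer_state.json" (inferred from global_step and save_steps above) and that matplotlib is available. It reads log_history and plots the logged training loss and learning-rate schedule against step.

# Minimal sketch: load a saved trainer state and plot its logged metrics.
# The file path is an assumption for illustration, not taken from the state file.
import json
import matplotlib.pyplot as plt

with open("checkpoint-500/trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry recorded here carries epoch, grad_norm, learning_rate, loss, and step.
logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, [entry["loss"] for entry in logs])
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, [entry["learning_rate"] for entry in logs])
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("step")
plt.tight_layout()
plt.show()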