{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5374, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018608113137327874, "grad_norm": 0.15951524674892426, "learning_rate": 2.0000000000000003e-06, "loss": 2.4521, "step": 100 }, { "epoch": 0.03721622627465575, "grad_norm": 0.10372528433799744, "learning_rate": 4.000000000000001e-06, "loss": 2.4034, "step": 200 }, { "epoch": 0.05582433941198362, "grad_norm": 0.20789675414562225, "learning_rate": 6e-06, "loss": 2.4476, "step": 300 }, { "epoch": 0.0744324525493115, "grad_norm": 0.2809026539325714, "learning_rate": 8.000000000000001e-06, "loss": 2.4063, "step": 400 }, { "epoch": 0.09304056568663938, "grad_norm": 0.3346979320049286, "learning_rate": 1e-05, "loss": 2.3631, "step": 500 }, { "epoch": 0.11164867882396724, "grad_norm": 0.42336997389793396, "learning_rate": 1.2e-05, "loss": 2.3173, "step": 600 }, { "epoch": 0.13025679196129514, "grad_norm": 0.4591374099254608, "learning_rate": 1.4e-05, "loss": 2.2564, "step": 700 }, { "epoch": 0.148864905098623, "grad_norm": 0.5659797787666321, "learning_rate": 1.6000000000000003e-05, "loss": 2.273, "step": 800 }, { "epoch": 0.16747301823595087, "grad_norm": 0.5892286896705627, "learning_rate": 1.8e-05, "loss": 2.2587, "step": 900 }, { "epoch": 0.18608113137327875, "grad_norm": 0.5323100686073303, "learning_rate": 2e-05, "loss": 2.1991, "step": 1000 }, { "epoch": 0.20468924451060663, "grad_norm": 0.6430222988128662, "learning_rate": 1.9974217472841614e-05, "loss": 2.1777, "step": 1100 }, { "epoch": 0.22329735764793449, "grad_norm": 0.6999238133430481, "learning_rate": 1.9897002839107783e-05, "loss": 2.1297, "step": 1200 }, { "epoch": 0.24190547078526237, "grad_norm": 0.8810726404190063, "learning_rate": 1.9768754256476755e-05, "loss": 2.17, "step": 1300 }, { "epoch": 0.2605135839225903, "grad_norm": 0.8605418801307678, "learning_rate": 1.9590133039461487e-05, "loss": 2.13, "step": 1400 }, { "epoch": 0.27912169705991813, "grad_norm": 0.711585521697998, "learning_rate": 1.936206024933772e-05, "loss": 2.162, "step": 1500 }, { "epoch": 0.297729810197246, "grad_norm": 0.8360315561294556, "learning_rate": 1.908571194468655e-05, "loss": 2.165, "step": 1600 }, { "epoch": 0.3163379233345739, "grad_norm": 0.9491225481033325, "learning_rate": 1.8762513117041943e-05, "loss": 2.0905, "step": 1700 }, { "epoch": 0.33494603647190174, "grad_norm": 0.6673754453659058, "learning_rate": 1.839413034291416e-05, "loss": 2.1022, "step": 1800 }, { "epoch": 0.35355414960922965, "grad_norm": 0.8447780013084412, "learning_rate": 1.7982463190078928e-05, "loss": 2.1088, "step": 1900 }, { "epoch": 0.3721622627465575, "grad_norm": 0.758438229560852, "learning_rate": 1.752963442244589e-05, "loss": 2.1265, "step": 2000 }, { "epoch": 0.39077037588388536, "grad_norm": 1.1905587911605835, "learning_rate": 1.703797905401496e-05, "loss": 2.0621, "step": 2100 }, { "epoch": 0.40937848902121327, "grad_norm": 0.8596579432487488, "learning_rate": 1.6510032308363964e-05, "loss": 2.0768, "step": 2200 }, { "epoch": 0.4279866021585411, "grad_norm": 0.8008110523223877, "learning_rate": 1.5948516545754497e-05, "loss": 2.134, "step": 2300 }, { "epoch": 0.44659471529586897, "grad_norm": 0.7902112007141113, "learning_rate": 1.5356327225266418e-05, "loss": 2.0793, "step": 2400 }, { "epoch": 0.4652028284331969, "grad_norm": 1.021168828010559, "learning_rate": 1.4736517974347408e-05, "loss": 2.0917, "step": 2500 }, { "epoch": 0.48381094157052473, "grad_norm": 1.0414773225784302, "learning_rate": 1.4092284842766439e-05, "loss": 2.0306, "step": 2600 }, { "epoch": 0.5024190547078526, "grad_norm": 0.8652853965759277, "learning_rate": 1.3426949822165768e-05, "loss": 2.0742, "step": 2700 }, { "epoch": 0.5210271678451805, "grad_norm": 0.8237012624740601, "learning_rate": 1.2743943716193017e-05, "loss": 2.0428, "step": 2800 }, { "epoch": 0.5396352809825083, "grad_norm": 1.091333270072937, "learning_rate": 1.2046788449543496e-05, "loss": 2.0632, "step": 2900 }, { "epoch": 0.5582433941198363, "grad_norm": 0.9181482791900635, "learning_rate": 1.1339078907136409e-05, "loss": 1.9869, "step": 3000 }, { "epoch": 0.5768515072571642, "grad_norm": 1.1163825988769531, "learning_rate": 1.0624464397071229e-05, "loss": 2.0724, "step": 3100 }, { "epoch": 0.595459620394492, "grad_norm": 1.086317539215088, "learning_rate": 9.906629832950659e-06, "loss": 2.0576, "step": 3200 }, { "epoch": 0.6140677335318199, "grad_norm": 0.822471559047699, "learning_rate": 9.189276732603637e-06, "loss": 2.1059, "step": 3300 }, { "epoch": 0.6326758466691478, "grad_norm": 0.9485145807266235, "learning_rate": 8.47610413118853e-06, "loss": 2.0566, "step": 3400 }, { "epoch": 0.6512839598064756, "grad_norm": 0.9303804039955139, "learning_rate": 7.770789507098263e-06, "loss": 2.0242, "step": 3500 }, { "epoch": 0.6698920729438035, "grad_norm": 1.1080023050308228, "learning_rate": 7.076969819022999e-06, "loss": 1.9736, "step": 3600 }, { "epoch": 0.6885001860811314, "grad_norm": 1.0893449783325195, "learning_rate": 6.3982227519528986e-06, "loss": 2.0214, "step": 3700 }, { "epoch": 0.7071082992184593, "grad_norm": 0.9729887843132019, "learning_rate": 5.738048268826046e-06, "loss": 2.0238, "step": 3800 }, { "epoch": 0.7257164123557871, "grad_norm": 0.7429510951042175, "learning_rate": 5.099850562950539e-06, "loss": 2.0608, "step": 3900 }, { "epoch": 0.744324525493115, "grad_norm": 1.201204538345337, "learning_rate": 4.486920504263212e-06, "loss": 1.9825, "step": 4000 }, { "epoch": 0.7629326386304429, "grad_norm": 0.9092488884925842, "learning_rate": 3.902418669940925e-06, "loss": 2.0295, "step": 4100 }, { "epoch": 0.7815407517677707, "grad_norm": 1.1268125772476196, "learning_rate": 3.3493590468671868e-06, "loss": 2.029, "step": 4200 }, { "epoch": 0.8001488649050986, "grad_norm": 0.817008376121521, "learning_rate": 2.8305934899924135e-06, "loss": 2.0129, "step": 4300 }, { "epoch": 0.8187569780424265, "grad_norm": 0.9579901099205017, "learning_rate": 2.348797016728398e-06, "loss": 2.0472, "step": 4400 }, { "epoch": 0.8373650911797543, "grad_norm": 0.8800289034843445, "learning_rate": 1.9064540132064946e-06, "loss": 1.959, "step": 4500 }, { "epoch": 0.8559732043170822, "grad_norm": 0.9605334997177124, "learning_rate": 1.505845423527027e-06, "loss": 1.9486, "step": 4600 }, { "epoch": 0.8745813174544101, "grad_norm": 0.7564200162887573, "learning_rate": 1.1490369880586516e-06, "loss": 2.0324, "step": 4700 }, { "epoch": 0.8931894305917379, "grad_norm": 1.0660285949707031, "learning_rate": 8.378685914369323e-07, "loss": 2.0176, "step": 4800 }, { "epoch": 0.9117975437290659, "grad_norm": 1.1286784410476685, "learning_rate": 5.739447751892135e-07, "loss": 2.0409, "step": 4900 }, { "epoch": 0.9304056568663938, "grad_norm": 0.9878057241439819, "learning_rate": 3.586264639075265e-07, "loss": 1.967, "step": 5000 }, { "epoch": 0.9490137700037217, "grad_norm": 0.9573999643325806, "learning_rate": 1.9302394763353606e-07, "loss": 2.0823, "step": 5100 }, { "epoch": 0.9676218831410495, "grad_norm": 0.7985095381736755, "learning_rate": 7.79911566419056e-08, "loss": 2.0383, "step": 5200 }, { "epoch": 0.9862299962783774, "grad_norm": 1.0881154537200928, "learning_rate": 1.4121258144207395e-08, "loss": 2.0119, "step": 5300 }, { "epoch": 1.0, "step": 5374, "total_flos": 9.788605661184e+16, "train_loss": 2.1113919824927456, "train_runtime": 862.752, "train_samples_per_second": 12.458, "train_steps_per_second": 6.229 } ], "logging_steps": 100, "max_steps": 5374, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.788605661184e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }