|
{ |
|
"best_metric": 0.5402693748474121, |
|
"best_model_checkpoint": "data/Mistral-7B_task-2_180-samples_config-2/checkpoint-42", |
|
"epoch": 12.0, |
|
"eval_steps": 500, |
|
"global_step": 102, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11764705882352941, |
|
"grad_norm": 2.3155293464660645, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.0419, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 2.6562511920928955, |
|
"learning_rate": 5e-06, |
|
"loss": 1.0989, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 2.378835916519165, |
|
"learning_rate": 1e-05, |
|
"loss": 1.0469, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 2.085439682006836, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.9504, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 1.7603436708450317, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8897, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"eval_loss": 0.8554134964942932, |
|
"eval_runtime": 45.606, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 1.640693187713623, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.8518, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.4117647058823528, |
|
"grad_norm": 0.8897013068199158, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7464, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 1.6470588235294117, |
|
"grad_norm": 0.5992234349250793, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.659, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 1.8823529411764706, |
|
"grad_norm": 0.5794339776039124, |
|
"learning_rate": 4e-05, |
|
"loss": 0.6578, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.6425343751907349, |
|
"eval_runtime": 45.6142, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 2.1176470588235294, |
|
"grad_norm": 0.49187347292900085, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.6511, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 0.4880034327507019, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5877, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.588235294117647, |
|
"grad_norm": 0.4656303822994232, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.6074, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 2.8235294117647056, |
|
"grad_norm": 0.4449208974838257, |
|
"learning_rate": 6e-05, |
|
"loss": 0.6205, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"eval_loss": 0.595414400100708, |
|
"eval_runtime": 45.6824, |
|
"eval_samples_per_second": 0.788, |
|
"eval_steps_per_second": 0.788, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 3.0588235294117645, |
|
"grad_norm": 0.4166175127029419, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.5775, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 3.2941176470588234, |
|
"grad_norm": 0.3330927789211273, |
|
"learning_rate": 7e-05, |
|
"loss": 0.5532, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 3.5294117647058822, |
|
"grad_norm": 0.37248340249061584, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.5553, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.764705882352941, |
|
"grad_norm": 0.3898817002773285, |
|
"learning_rate": 8e-05, |
|
"loss": 0.5447, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.3806105852127075, |
|
"learning_rate": 8.5e-05, |
|
"loss": 0.4973, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.5606213212013245, |
|
"eval_runtime": 45.6824, |
|
"eval_samples_per_second": 0.788, |
|
"eval_steps_per_second": 0.788, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 4.235294117647059, |
|
"grad_norm": 0.37909018993377686, |
|
"learning_rate": 9e-05, |
|
"loss": 0.5128, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 4.470588235294118, |
|
"grad_norm": 0.39016708731651306, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.4783, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 4.705882352941177, |
|
"grad_norm": 0.34981122612953186, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4516, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 4.9411764705882355, |
|
"grad_norm": 0.40651199221611023, |
|
"learning_rate": 9.999238475781957e-05, |
|
"loss": 0.4478, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 4.9411764705882355, |
|
"eval_loss": 0.5402693748474121, |
|
"eval_runtime": 45.6024, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 5.176470588235294, |
|
"grad_norm": 0.38201889395713806, |
|
"learning_rate": 9.99695413509548e-05, |
|
"loss": 0.4217, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 5.411764705882353, |
|
"grad_norm": 0.3878049850463867, |
|
"learning_rate": 9.99314767377287e-05, |
|
"loss": 0.4203, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 5.647058823529412, |
|
"grad_norm": 0.463470458984375, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 0.3668, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 5.882352941176471, |
|
"grad_norm": 0.5280053019523621, |
|
"learning_rate": 9.980973490458728e-05, |
|
"loss": 0.3946, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.5508570075035095, |
|
"eval_runtime": 45.6036, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 6.117647058823529, |
|
"grad_norm": 0.5039118528366089, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 0.344, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 6.352941176470588, |
|
"grad_norm": 0.5101126432418823, |
|
"learning_rate": 9.962730758206611e-05, |
|
"loss": 0.2839, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 6.588235294117647, |
|
"grad_norm": 0.7359801530838013, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 0.2958, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 6.823529411764706, |
|
"grad_norm": 0.7432470321655273, |
|
"learning_rate": 9.938441702975689e-05, |
|
"loss": 0.3109, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 6.9411764705882355, |
|
"eval_loss": 0.595815896987915, |
|
"eval_runtime": 45.6043, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 7.0588235294117645, |
|
"grad_norm": 0.7509456276893616, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 0.2704, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 7.294117647058823, |
|
"grad_norm": 0.7170895338058472, |
|
"learning_rate": 9.908135917238321e-05, |
|
"loss": 0.2012, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 7.529411764705882, |
|
"grad_norm": 0.9948111176490784, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 0.1713, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 7.764705882352941, |
|
"grad_norm": 1.1264708042144775, |
|
"learning_rate": 9.871850323926177e-05, |
|
"loss": 0.1708, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.9287075996398926, |
|
"learning_rate": 9.851478631379982e-05, |
|
"loss": 0.182, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.6715656518936157, |
|
"eval_runtime": 45.6085, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 8.235294117647058, |
|
"grad_norm": 0.8156751990318298, |
|
"learning_rate": 9.829629131445342e-05, |
|
"loss": 0.115, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 8.470588235294118, |
|
"grad_norm": 1.2089699506759644, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 0.1075, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 8.705882352941176, |
|
"grad_norm": 1.3467602729797363, |
|
"learning_rate": 9.781523779815179e-05, |
|
"loss": 0.0964, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 8.941176470588236, |
|
"grad_norm": 0.9090074300765991, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 0.0682, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 8.941176470588236, |
|
"eval_loss": 0.7786654233932495, |
|
"eval_runtime": 45.6063, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 9.176470588235293, |
|
"grad_norm": 1.0487481355667114, |
|
"learning_rate": 9.727592877996585e-05, |
|
"loss": 0.066, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 9.411764705882353, |
|
"grad_norm": 1.0376051664352417, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 0.0467, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 9.647058823529411, |
|
"grad_norm": 1.2137938737869263, |
|
"learning_rate": 9.667902132486009e-05, |
|
"loss": 0.0452, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 9.882352941176471, |
|
"grad_norm": 1.0914534330368042, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 0.0461, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.9119131565093994, |
|
"eval_runtime": 45.6034, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 10.117647058823529, |
|
"grad_norm": 0.5998616218566895, |
|
"learning_rate": 9.602524267262203e-05, |
|
"loss": 0.0361, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 10.352941176470589, |
|
"grad_norm": 0.7274846434593201, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 0.0236, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 10.588235294117647, |
|
"grad_norm": 0.8589937090873718, |
|
"learning_rate": 9.53153893518325e-05, |
|
"loss": 0.029, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 10.823529411764707, |
|
"grad_norm": 0.9843662977218628, |
|
"learning_rate": 9.493970231495835e-05, |
|
"loss": 0.03, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 10.941176470588236, |
|
"eval_loss": 0.989643394947052, |
|
"eval_runtime": 45.6021, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 11.058823529411764, |
|
"grad_norm": 0.5862740874290466, |
|
"learning_rate": 9.45503262094184e-05, |
|
"loss": 0.0274, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 11.294117647058824, |
|
"grad_norm": 0.5251568555831909, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 0.012, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 11.529411764705882, |
|
"grad_norm": 0.4932527542114258, |
|
"learning_rate": 9.373098535696979e-05, |
|
"loss": 0.0133, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 11.764705882352942, |
|
"grad_norm": 0.8276335597038269, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 0.0138, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.488267183303833, |
|
"learning_rate": 9.285836503510562e-05, |
|
"loss": 0.0116, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 1.0824575424194336, |
|
"eval_runtime": 45.6139, |
|
"eval_samples_per_second": 0.789, |
|
"eval_steps_per_second": 0.789, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"step": 102, |
|
"total_flos": 1.9338296750925414e+17, |
|
"train_loss": 0.3643010138333136, |
|
"train_runtime": 7015.4881, |
|
"train_samples_per_second": 0.969, |
|
"train_steps_per_second": 0.057 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 7, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.9338296750925414e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|