{ "best_metric": 0.47372516989707947, "best_model_checkpoint": "/data/jcanete/all_results/xnli/albeto_xlarge/epochs_2_bs_16_lr_5e-6/checkpoint-22000", "epoch": 2.0, "global_step": 49088, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 4.9493766297262065e-06, "loss": 1.0212, "step": 500 }, { "epoch": 0.04, "learning_rate": 4.898549543676663e-06, "loss": 0.8644, "step": 1000 }, { "epoch": 0.06, "learning_rate": 4.847722457627119e-06, "loss": 0.765, "step": 1500 }, { "epoch": 0.08, "learning_rate": 4.7967935136897e-06, "loss": 0.7077, "step": 2000 }, { "epoch": 0.08, "eval_accuracy": 0.7052208835341366, "eval_loss": 0.6958776712417603, "eval_runtime": 15.7507, "eval_samples_per_second": 158.088, "eval_steps_per_second": 9.904, "step": 2000 }, { "epoch": 0.1, "learning_rate": 4.7458645697522824e-06, "loss": 0.6694, "step": 2500 }, { "epoch": 0.12, "learning_rate": 4.694935625814864e-06, "loss": 0.6511, "step": 3000 }, { "epoch": 0.14, "learning_rate": 4.644006681877445e-06, "loss": 0.6403, "step": 3500 }, { "epoch": 0.16, "learning_rate": 4.593179595827901e-06, "loss": 0.6062, "step": 4000 }, { "epoch": 0.16, "eval_accuracy": 0.7610441767068273, "eval_loss": 0.6156352162361145, "eval_runtime": 15.8305, "eval_samples_per_second": 157.291, "eval_steps_per_second": 9.854, "step": 4000 }, { "epoch": 0.18, "learning_rate": 4.542250651890483e-06, "loss": 0.6191, "step": 4500 }, { "epoch": 0.2, "learning_rate": 4.491321707953064e-06, "loss": 0.6144, "step": 5000 }, { "epoch": 0.22, "learning_rate": 4.440392764015646e-06, "loss": 0.6089, "step": 5500 }, { "epoch": 0.24, "learning_rate": 4.389463820078227e-06, "loss": 0.584, "step": 6000 }, { "epoch": 0.24, "eval_accuracy": 0.7698795180722892, "eval_loss": 0.6093908548355103, "eval_runtime": 15.9514, "eval_samples_per_second": 156.099, "eval_steps_per_second": 9.78, "step": 6000 }, { "epoch": 0.26, "learning_rate": 4.338636734028683e-06, "loss": 0.583, "step": 6500 }, { "epoch": 0.29, "learning_rate": 4.287707790091265e-06, "loss": 0.5849, "step": 7000 }, { "epoch": 0.31, "learning_rate": 4.236778846153847e-06, "loss": 0.5819, "step": 7500 }, { "epoch": 0.33, "learning_rate": 4.185849902216428e-06, "loss": 0.5567, "step": 8000 }, { "epoch": 0.33, "eval_accuracy": 0.7718875502008032, "eval_loss": 0.5614191889762878, "eval_runtime": 15.8137, "eval_samples_per_second": 157.458, "eval_steps_per_second": 9.865, "step": 8000 }, { "epoch": 0.35, "learning_rate": 4.134920958279009e-06, "loss": 0.5554, "step": 8500 }, { "epoch": 0.37, "learning_rate": 4.083992014341591e-06, "loss": 0.5476, "step": 9000 }, { "epoch": 0.39, "learning_rate": 4.033063070404172e-06, "loss": 0.5531, "step": 9500 }, { "epoch": 0.41, "learning_rate": 3.982134126466754e-06, "loss": 0.5464, "step": 10000 }, { "epoch": 0.41, "eval_accuracy": 0.7847389558232932, "eval_loss": 0.5493358373641968, "eval_runtime": 15.7384, "eval_samples_per_second": 158.211, "eval_steps_per_second": 9.912, "step": 10000 }, { "epoch": 0.43, "learning_rate": 3.9312051825293356e-06, "loss": 0.5482, "step": 10500 }, { "epoch": 0.45, "learning_rate": 3.880378096479792e-06, "loss": 0.5451, "step": 11000 }, { "epoch": 0.47, "learning_rate": 3.8294491525423735e-06, "loss": 0.5402, "step": 11500 }, { "epoch": 0.49, "learning_rate": 3.7786220664928295e-06, "loss": 0.5393, "step": 12000 }, { "epoch": 0.49, "eval_accuracy": 0.7911646586345381, "eval_loss": 0.5225389003753662, "eval_runtime": 15.9954, "eval_samples_per_second": 155.67, "eval_steps_per_second": 9.753, "step": 12000 }, { "epoch": 0.51, "learning_rate": 3.7276931225554107e-06, "loss": 0.5344, "step": 12500 }, { "epoch": 0.53, "learning_rate": 3.6767641786179927e-06, "loss": 0.5495, "step": 13000 }, { "epoch": 0.55, "learning_rate": 3.6258352346805742e-06, "loss": 0.5362, "step": 13500 }, { "epoch": 0.57, "learning_rate": 3.5750081486310302e-06, "loss": 0.5308, "step": 14000 }, { "epoch": 0.57, "eval_accuracy": 0.7919678714859437, "eval_loss": 0.527259111404419, "eval_runtime": 16.0033, "eval_samples_per_second": 155.593, "eval_steps_per_second": 9.748, "step": 14000 }, { "epoch": 0.59, "learning_rate": 3.5240792046936118e-06, "loss": 0.5218, "step": 14500 }, { "epoch": 0.61, "learning_rate": 3.4731502607561934e-06, "loss": 0.5421, "step": 15000 }, { "epoch": 0.63, "learning_rate": 3.4222213168187745e-06, "loss": 0.5173, "step": 15500 }, { "epoch": 0.65, "learning_rate": 3.371292372881356e-06, "loss": 0.5246, "step": 16000 }, { "epoch": 0.65, "eval_accuracy": 0.7975903614457831, "eval_loss": 0.506671130657196, "eval_runtime": 15.9337, "eval_samples_per_second": 156.272, "eval_steps_per_second": 9.791, "step": 16000 }, { "epoch": 0.67, "learning_rate": 3.3204652868318125e-06, "loss": 0.5237, "step": 16500 }, { "epoch": 0.69, "learning_rate": 3.2695363428943936e-06, "loss": 0.5186, "step": 17000 }, { "epoch": 0.71, "learning_rate": 3.2186073989569756e-06, "loss": 0.5267, "step": 17500 }, { "epoch": 0.73, "learning_rate": 3.167678455019557e-06, "loss": 0.5075, "step": 18000 }, { "epoch": 0.73, "eval_accuracy": 0.7967871485943775, "eval_loss": 0.5081815719604492, "eval_runtime": 16.0036, "eval_samples_per_second": 155.59, "eval_steps_per_second": 9.748, "step": 18000 }, { "epoch": 0.75, "learning_rate": 3.1167495110821384e-06, "loss": 0.5029, "step": 18500 }, { "epoch": 0.77, "learning_rate": 3.06582056714472e-06, "loss": 0.5123, "step": 19000 }, { "epoch": 0.79, "learning_rate": 3.0148916232073015e-06, "loss": 0.5031, "step": 19500 }, { "epoch": 0.81, "learning_rate": 2.9639626792698826e-06, "loss": 0.5008, "step": 20000 }, { "epoch": 0.81, "eval_accuracy": 0.804417670682731, "eval_loss": 0.49554213881492615, "eval_runtime": 16.1075, "eval_samples_per_second": 154.587, "eval_steps_per_second": 9.685, "step": 20000 }, { "epoch": 0.84, "learning_rate": 2.9130337353324646e-06, "loss": 0.5054, "step": 20500 }, { "epoch": 0.86, "learning_rate": 2.862104791395046e-06, "loss": 0.4996, "step": 21000 }, { "epoch": 0.88, "learning_rate": 2.8111758474576274e-06, "loss": 0.4849, "step": 21500 }, { "epoch": 0.9, "learning_rate": 2.760246903520209e-06, "loss": 0.507, "step": 22000 }, { "epoch": 0.9, "eval_accuracy": 0.8152610441767069, "eval_loss": 0.47372516989707947, "eval_runtime": 16.3654, "eval_samples_per_second": 152.15, "eval_steps_per_second": 9.532, "step": 22000 }, { "epoch": 0.92, "learning_rate": 2.70931795958279e-06, "loss": 0.4995, "step": 22500 }, { "epoch": 0.94, "learning_rate": 2.6584908735332465e-06, "loss": 0.5007, "step": 23000 }, { "epoch": 0.96, "learning_rate": 2.607663787483703e-06, "loss": 0.4944, "step": 23500 }, { "epoch": 0.98, "learning_rate": 2.5567348435462845e-06, "loss": 0.5053, "step": 24000 }, { "epoch": 0.98, "eval_accuracy": 0.8072289156626506, "eval_loss": 0.49480196833610535, "eval_runtime": 16.2328, "eval_samples_per_second": 153.393, "eval_steps_per_second": 9.61, "step": 24000 }, { "epoch": 1.0, "learning_rate": 2.5058058996088656e-06, "loss": 0.4946, "step": 24500 }, { "epoch": 1.02, "learning_rate": 2.4548769556714476e-06, "loss": 0.4133, "step": 25000 }, { "epoch": 1.04, "learning_rate": 2.4039480117340288e-06, "loss": 0.4103, "step": 25500 }, { "epoch": 1.06, "learning_rate": 2.3530190677966103e-06, "loss": 0.4063, "step": 26000 }, { "epoch": 1.06, "eval_accuracy": 0.8040160642570281, "eval_loss": 0.4989866316318512, "eval_runtime": 15.9626, "eval_samples_per_second": 155.99, "eval_steps_per_second": 9.773, "step": 26000 }, { "epoch": 1.08, "learning_rate": 2.3021919817470667e-06, "loss": 0.4123, "step": 26500 }, { "epoch": 1.1, "learning_rate": 2.2512630378096483e-06, "loss": 0.3875, "step": 27000 }, { "epoch": 1.12, "learning_rate": 2.2003340938722295e-06, "loss": 0.4103, "step": 27500 }, { "epoch": 1.14, "learning_rate": 2.149405149934811e-06, "loss": 0.4061, "step": 28000 }, { "epoch": 1.14, "eval_accuracy": 0.797991967871486, "eval_loss": 0.5204265117645264, "eval_runtime": 16.59, "eval_samples_per_second": 150.09, "eval_steps_per_second": 9.403, "step": 28000 }, { "epoch": 1.16, "learning_rate": 2.0985780638852674e-06, "loss": 0.4178, "step": 28500 }, { "epoch": 1.18, "learning_rate": 2.047750977835724e-06, "loss": 0.4062, "step": 29000 }, { "epoch": 1.2, "learning_rate": 1.9968220338983054e-06, "loss": 0.4142, "step": 29500 }, { "epoch": 1.22, "learning_rate": 1.9458930899608866e-06, "loss": 0.4118, "step": 30000 }, { "epoch": 1.22, "eval_accuracy": 0.8120481927710843, "eval_loss": 0.5043097138404846, "eval_runtime": 15.9992, "eval_samples_per_second": 155.632, "eval_steps_per_second": 9.75, "step": 30000 }, { "epoch": 1.24, "learning_rate": 1.8949641460234683e-06, "loss": 0.4032, "step": 30500 }, { "epoch": 1.26, "learning_rate": 1.8440352020860497e-06, "loss": 0.4004, "step": 31000 }, { "epoch": 1.28, "learning_rate": 1.793106258148631e-06, "loss": 0.3999, "step": 31500 }, { "epoch": 1.3, "learning_rate": 1.7421773142112129e-06, "loss": 0.4057, "step": 32000 }, { "epoch": 1.3, "eval_accuracy": 0.8176706827309237, "eval_loss": 0.4870663285255432, "eval_runtime": 15.9746, "eval_samples_per_second": 155.872, "eval_steps_per_second": 9.765, "step": 32000 }, { "epoch": 1.32, "learning_rate": 1.691350228161669e-06, "loss": 0.4008, "step": 32500 }, { "epoch": 1.34, "learning_rate": 1.6404212842242504e-06, "loss": 0.4112, "step": 33000 }, { "epoch": 1.36, "learning_rate": 1.589492340286832e-06, "loss": 0.4101, "step": 33500 }, { "epoch": 1.39, "learning_rate": 1.5385633963494133e-06, "loss": 0.4108, "step": 34000 }, { "epoch": 1.39, "eval_accuracy": 0.8052208835341366, "eval_loss": 0.5033333897590637, "eval_runtime": 16.0841, "eval_samples_per_second": 154.811, "eval_steps_per_second": 9.699, "step": 34000 }, { "epoch": 1.41, "learning_rate": 1.487634452411995e-06, "loss": 0.3994, "step": 34500 }, { "epoch": 1.43, "learning_rate": 1.4367055084745765e-06, "loss": 0.4046, "step": 35000 }, { "epoch": 1.45, "learning_rate": 1.3857765645371578e-06, "loss": 0.3958, "step": 35500 }, { "epoch": 1.47, "learning_rate": 1.334949478487614e-06, "loss": 0.3927, "step": 36000 }, { "epoch": 1.47, "eval_accuracy": 0.8068273092369478, "eval_loss": 0.4963078498840332, "eval_runtime": 15.7763, "eval_samples_per_second": 157.832, "eval_steps_per_second": 9.888, "step": 36000 }, { "epoch": 1.49, "learning_rate": 1.2840205345501958e-06, "loss": 0.3971, "step": 36500 }, { "epoch": 1.51, "learning_rate": 1.2330915906127772e-06, "loss": 0.3985, "step": 37000 }, { "epoch": 1.53, "learning_rate": 1.1821626466753585e-06, "loss": 0.3922, "step": 37500 }, { "epoch": 1.55, "learning_rate": 1.1312337027379401e-06, "loss": 0.4077, "step": 38000 }, { "epoch": 1.55, "eval_accuracy": 0.814859437751004, "eval_loss": 0.47995567321777344, "eval_runtime": 15.901, "eval_samples_per_second": 156.594, "eval_steps_per_second": 9.811, "step": 38000 }, { "epoch": 1.57, "learning_rate": 1.0803047588005217e-06, "loss": 0.4048, "step": 38500 }, { "epoch": 1.59, "learning_rate": 1.029375814863103e-06, "loss": 0.3926, "step": 39000 }, { "epoch": 1.61, "learning_rate": 9.784468709256846e-07, "loss": 0.4051, "step": 39500 }, { "epoch": 1.63, "learning_rate": 9.27517926988266e-07, "loss": 0.4029, "step": 40000 }, { "epoch": 1.63, "eval_accuracy": 0.810843373493976, "eval_loss": 0.4851303994655609, "eval_runtime": 15.8122, "eval_samples_per_second": 157.474, "eval_steps_per_second": 9.866, "step": 40000 }, { "epoch": 1.65, "learning_rate": 8.765889830508476e-07, "loss": 0.385, "step": 40500 }, { "epoch": 1.67, "learning_rate": 8.256600391134289e-07, "loss": 0.4012, "step": 41000 }, { "epoch": 1.69, "learning_rate": 7.747310951760105e-07, "loss": 0.3998, "step": 41500 }, { "epoch": 1.71, "learning_rate": 7.239040091264668e-07, "loss": 0.3926, "step": 42000 }, { "epoch": 1.71, "eval_accuracy": 0.8068273092369478, "eval_loss": 0.49413371086120605, "eval_runtime": 16.1684, "eval_samples_per_second": 154.004, "eval_steps_per_second": 9.648, "step": 42000 }, { "epoch": 1.73, "learning_rate": 6.729750651890484e-07, "loss": 0.396, "step": 42500 }, { "epoch": 1.75, "learning_rate": 6.220461212516297e-07, "loss": 0.3896, "step": 43000 }, { "epoch": 1.77, "learning_rate": 5.711171773142113e-07, "loss": 0.3993, "step": 43500 }, { "epoch": 1.79, "learning_rate": 5.201882333767928e-07, "loss": 0.3895, "step": 44000 }, { "epoch": 1.79, "eval_accuracy": 0.8112449799196787, "eval_loss": 0.47773313522338867, "eval_runtime": 16.1399, "eval_samples_per_second": 154.276, "eval_steps_per_second": 9.665, "step": 44000 }, { "epoch": 1.81, "learning_rate": 4.694630052151239e-07, "loss": 0.3971, "step": 44500 }, { "epoch": 1.83, "learning_rate": 4.1853406127770535e-07, "loss": 0.3864, "step": 45000 }, { "epoch": 1.85, "learning_rate": 3.6770697522816176e-07, "loss": 0.3901, "step": 45500 }, { "epoch": 1.87, "learning_rate": 3.167780312907432e-07, "loss": 0.3806, "step": 46000 }, { "epoch": 1.87, "eval_accuracy": 0.8124497991967872, "eval_loss": 0.4864996075630188, "eval_runtime": 15.9489, "eval_samples_per_second": 156.123, "eval_steps_per_second": 9.781, "step": 46000 }, { "epoch": 1.89, "learning_rate": 2.6584908735332464e-07, "loss": 0.386, "step": 46500 }, { "epoch": 1.91, "learning_rate": 2.1492014341590616e-07, "loss": 0.3804, "step": 47000 }, { "epoch": 1.94, "learning_rate": 1.6409305736636246e-07, "loss": 0.3886, "step": 47500 }, { "epoch": 1.96, "learning_rate": 1.1316411342894395e-07, "loss": 0.3771, "step": 48000 }, { "epoch": 1.96, "eval_accuracy": 0.8156626506024096, "eval_loss": 0.4936372935771942, "eval_runtime": 16.2104, "eval_samples_per_second": 153.605, "eval_steps_per_second": 9.623, "step": 48000 }, { "epoch": 1.98, "learning_rate": 6.223516949152543e-08, "loss": 0.4011, "step": 48500 }, { "epoch": 2.0, "learning_rate": 1.130622555410691e-08, "loss": 0.394, "step": 49000 }, { "epoch": 2.0, "step": 49088, "total_flos": 2.2556354760831744e+16, "train_loss": 0.4839201732751133, "train_runtime": 15881.4239, "train_samples_per_second": 49.454, "train_steps_per_second": 3.091 } ], "max_steps": 49088, "num_train_epochs": 2, "total_flos": 2.2556354760831744e+16, "trial_name": null, "trial_params": null }