yoshitomo-matsubara committed
Commit 20cfc23
1 Parent(s): 68c33a9

initial commit

README.md ADDED
@@ -0,0 +1,19 @@
+ ---
+ language: en
+ tags:
+ - bert
+ - qqp
+ - glue
+ - kd
+ - torchdistill
+ license: apache-2.0
+ datasets:
+ - qqp
+ metrics:
+ - f1
+ - accuracy
+ ---
+
+ `bert-base-uncased` fine-tuned on the QQP dataset, using a fine-tuned `bert-large-uncased` as the teacher model, with [***torchdistill***](https://github.com/yoshitomo-matsubara/torchdistill) and [Google Colab](https://colab.research.google.com/github/yoshitomo-matsubara/torchdistill/blob/master/demo/glue_kd_and_submission.ipynb) for knowledge distillation.
+ The training configuration (including hyperparameters) is available [here](https://github.com/yoshitomo-matsubara/torchdistill/blob/main/configs/sample/glue/qqp/kd/bert_base_uncased_from_bert_large_uncased.yaml).
+ I submitted the prediction files to [the GLUE leaderboard](https://gluebenchmark.com/leaderboard), and the overall GLUE score was **78.9**.
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "_name_or_path": "bert-base-uncased",
+ "architectures": [
+ "BertForSequenceClassification"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "finetuning_task": "qqp",
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "problem_type": "single_label_classification",
+ "transformers_version": "4.6.1",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 30522
+ }
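The config describes a standard 12-layer BERT-base encoder with a two-way classification head. As a sanity check (a sketch assuming `transformers` and `torch` are installed), an equivalent architecture can be rebuilt from the config to confirm the parameter count implied by the checkpoint size below:

```python
from transformers import BertConfig, BertForSequenceClassification

# Rebuild an equivalent architecture (num_labels=2 for QQP); no weights needed.
config = BertConfig.from_pretrained("bert-base-uncased", num_labels=2)
model = BertForSequenceClassification(config)

n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")
# ~109.5M float32 parameters, consistent with the ~438 MB pytorch_model.bin below.
```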
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f03bbdc2be5dccac85943bda86d7554e83ab88c087badebd1fa53898a846403
+ size 438024457
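`pytorch_model.bin` is stored via Git LFS, so the three lines above are only a pointer recording the object hash and byte size. A small sketch to verify a downloaded checkpoint against that pointer:

```python
import hashlib

# Expected values from the Git LFS pointer above.
EXPECTED_OID = "8f03bbdc2be5dccac85943bda86d7554e83ab88c087badebd1fa53898a846403"
EXPECTED_SIZE = 438024457

sha256 = hashlib.sha256()
size = 0
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)
        size += len(chunk)

assert size == EXPECTED_SIZE, f"size mismatch: {size} != {EXPECTED_SIZE}"
assert sha256.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("checkpoint matches the LFS pointer")
```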
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_lower": true, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased"}
training.log ADDED
@@ -0,0 +1,83 @@
+ 2021-05-31 17:59:28,970 INFO __main__ Namespace(adjust_lr=False, config='torchdistill/configs/sample/glue/qqp/kd/bert_base_uncased_from_bert_large_uncased.yaml', log='log/glue/qqp/kd/bert_base_uncased_from_bert_large_uncased.txt', private_output='leaderboard/glue/kd/bert_base_uncased_from_bert_large_uncased/', seed=None, student_only=False, task_name='qqp', test_only=False, world_size=1)
+ 2021-05-31 17:59:29,014 INFO __main__ Distributed environment: NO
+ Num processes: 1
+ Process index: 0
+ Local process index: 0
+ Device: cuda
+ Use FP16 precision: True
+
+ 2021-05-31 17:59:29,381 INFO filelock Lock 139678033932496 acquired on /root/.cache/huggingface/transformers/c6730a889404372cd78a39f068b75ca306635a3917e558492a19a0d45744a44d.f536da3d1200cfceae13ba24b76fd5963abfa15c8bb1c0f10dc5607f87d61f41.lock
+ 2021-05-31 17:59:29,730 INFO filelock Lock 139678033932496 released on /root/.cache/huggingface/transformers/c6730a889404372cd78a39f068b75ca306635a3917e558492a19a0d45744a44d.f536da3d1200cfceae13ba24b76fd5963abfa15c8bb1c0f10dc5607f87d61f41.lock
+ 2021-05-31 17:59:30,427 INFO filelock Lock 139678033886800 acquired on /root/.cache/huggingface/transformers/3ee6604aa6258f1b4373e9fcadfca3fefec09a87b2759cd50d9cb6933d94860b.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
+ 2021-05-31 17:59:30,938 INFO filelock Lock 139678033886800 released on /root/.cache/huggingface/transformers/3ee6604aa6258f1b4373e9fcadfca3fefec09a87b2759cd50d9cb6933d94860b.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
+ 2021-05-31 17:59:31,287 INFO filelock Lock 139678034384272 acquired on /root/.cache/huggingface/transformers/bbd1c7d170a568d80fe40c1392226fd011ecc648f9c223c0136c51ff0884c909.f471bd2d72c48b932f7be40446896b7e97c3be406ee93abfb500399bc606c829.lock
+ 2021-05-31 17:59:31,818 INFO filelock Lock 139678034384272 released on /root/.cache/huggingface/transformers/bbd1c7d170a568d80fe40c1392226fd011ecc648f9c223c0136c51ff0884c909.f471bd2d72c48b932f7be40446896b7e97c3be406ee93abfb500399bc606c829.lock
+ 2021-05-31 17:59:32,522 INFO filelock Lock 139678002442320 acquired on /root/.cache/huggingface/transformers/178af5611f560962f8aff2195e99d14fa27d8d1d34bf73a805dc36ea8f67051e.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
+ 2021-05-31 17:59:32,874 INFO filelock Lock 139678002442320 released on /root/.cache/huggingface/transformers/178af5611f560962f8aff2195e99d14fa27d8d1d34bf73a805dc36ea8f67051e.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
+ 2021-05-31 17:59:33,220 INFO filelock Lock 139678002031248 acquired on /root/.cache/huggingface/transformers/aed86e8623bdb9c9299ed2490a8251e31870ef9d5c7c357ea2f771e9d107de83.0f95f2171d2c33a9e9e088c1e5decb2dfb3a22fb00d904f96183827da9540426.lock
+ 2021-05-31 17:59:33,573 INFO filelock Lock 139678002031248 released on /root/.cache/huggingface/transformers/aed86e8623bdb9c9299ed2490a8251e31870ef9d5c7c357ea2f771e9d107de83.0f95f2171d2c33a9e9e088c1e5decb2dfb3a22fb00d904f96183827da9540426.lock
+ 2021-05-31 17:59:33,943 INFO filelock Lock 139678002442320 acquired on /root/.cache/huggingface/transformers/02b89a954d5801fe4697b7762202140277ff423972dcd3031ad702d2acdf53be.e1aab307d0e91d479c718b89c31b899cf5bb68246e073ef69be407f796cd3548.lock
+ 2021-05-31 18:00:27,978 INFO filelock Lock 139678002442320 released on /root/.cache/huggingface/transformers/02b89a954d5801fe4697b7762202140277ff423972dcd3031ad702d2acdf53be.e1aab307d0e91d479c718b89c31b899cf5bb68246e073ef69be407f796cd3548.lock
+ 2021-05-31 18:02:18,010 INFO __main__ Start training
+ 2021-05-31 18:02:18,010 INFO torchdistill.models.util [teacher model]
+ 2021-05-31 18:02:18,011 INFO torchdistill.models.util Using the original teacher model
+ 2021-05-31 18:02:18,011 INFO torchdistill.models.util [student model]
+ 2021-05-31 18:02:18,011 INFO torchdistill.models.util Using the original student model
+ 2021-05-31 18:02:18,011 INFO torchdistill.core.distillation Loss = 1.0 * OrgLoss
+ 2021-05-31 18:02:18,011 INFO torchdistill.core.distillation Freezing the whole teacher model
+ 2021-05-31 18:02:22,568 INFO torchdistill.misc.log Epoch: [0] [ 0/11371] eta: 0:21:22 lr: 9.999706856623575e-05 sample/s: 45.18920666045369 loss: 0.1681 (0.1681) time: 0.1128 data: 0.0243 max mem: 2202
+ 2021-05-31 18:04:09,607 INFO torchdistill.misc.log Epoch: [0] [ 1000/11371] eta: 0:18:30 lr: 9.706563480198166e-05 sample/s: 41.91610890003173 loss: 0.0366 (0.0497) time: 0.1076 data: 0.0043 max mem: 5127
+ 2021-05-31 18:05:56,745 INFO torchdistill.misc.log Epoch: [0] [ 2000/11371] eta: 0:16:43 lr: 9.413420103772755e-05 sample/s: 38.974362786548596 loss: 0.0280 (0.0401) time: 0.1092 data: 0.0042 max mem: 5127
+ 2021-05-31 18:07:44,118 INFO torchdistill.misc.log Epoch: [0] [ 3000/11371] eta: 0:14:57 lr: 9.120276727347346e-05 sample/s: 37.538884252308534 loss: 0.0216 (0.0356) time: 0.1021 data: 0.0039 max mem: 5127
+ 2021-05-31 18:09:30,635 INFO torchdistill.misc.log Epoch: [0] [ 4000/11371] eta: 0:13:08 lr: 8.827133350921937e-05 sample/s: 32.439047781094835 loss: 0.0228 (0.0329) time: 0.1045 data: 0.0040 max mem: 5127
+ 2021-05-31 18:11:16,898 INFO torchdistill.misc.log Epoch: [0] [ 5000/11371] eta: 0:11:20 lr: 8.533989974496526e-05 sample/s: 37.61117313426144 loss: 0.0166 (0.0309) time: 0.0997 data: 0.0040 max mem: 5127
+ 2021-05-31 18:13:03,625 INFO torchdistill.misc.log Epoch: [0] [ 6000/11371] eta: 0:09:33 lr: 8.240846598071117e-05 sample/s: 42.08845561414688 loss: 0.0197 (0.0293) time: 0.1062 data: 0.0043 max mem: 5127
+ 2021-05-31 18:14:50,725 INFO torchdistill.misc.log Epoch: [0] [ 7000/11371] eta: 0:07:47 lr: 7.947703221645707e-05 sample/s: 39.292008206319615 loss: 0.0159 (0.0280) time: 0.1041 data: 0.0041 max mem: 5127
+ 2021-05-31 18:16:38,156 INFO torchdistill.misc.log Epoch: [0] [ 8000/11371] eta: 0:06:00 lr: 7.654559845220297e-05 sample/s: 39.23522120830858 loss: 0.0158 (0.0269) time: 0.1074 data: 0.0041 max mem: 5127
+ 2021-05-31 18:18:25,336 INFO torchdistill.misc.log Epoch: [0] [ 9000/11371] eta: 0:04:13 lr: 7.361416468794888e-05 sample/s: 44.30528714748609 loss: 0.0146 (0.0259) time: 0.1048 data: 0.0043 max mem: 5127
+ 2021-05-31 18:20:12,518 INFO torchdistill.misc.log Epoch: [0] [10000/11371] eta: 0:02:26 lr: 7.068273092369478e-05 sample/s: 39.33696600234467 loss: 0.0163 (0.0251) time: 0.1063 data: 0.0042 max mem: 5127
+ 2021-05-31 18:21:59,233 INFO torchdistill.misc.log Epoch: [0] [11000/11371] eta: 0:00:39 lr: 6.775129715944069e-05 sample/s: 39.52323022921624 loss: 0.0177 (0.0244) time: 0.1096 data: 0.0041 max mem: 5127
+ 2021-05-31 18:22:38,741 INFO torchdistill.misc.log Epoch: [0] Total time: 0:20:16
+ 2021-05-31 18:23:05,411 INFO /usr/local/lib/python3.7/dist-packages/datasets/metric.py Removing /root/.cache/huggingface/metrics/glue/qqp/default_experiment-1-0.arrow
+ 2021-05-31 18:23:05,413 INFO __main__ Validation: accuracy = 0.9027702201335642, f1 = 0.867495870832912
+ 2021-05-31 18:23:05,413 INFO __main__ Updating ckpt at ./resource/ckpt/glue/qqp/kd/qqp-bert-base-uncased_from_bert-large-uncased
+ 2021-05-31 18:23:06,558 INFO torchdistill.misc.log Epoch: [1] [ 0/11371] eta: 0:25:33 lr: 6.666373523290241e-05 sample/s: 37.026152128143224 loss: 0.0117 (0.0117) time: 0.1349 data: 0.0268 max mem: 5127
+ 2021-05-31 18:24:52,990 INFO torchdistill.misc.log Epoch: [1] [ 1000/11371] eta: 0:18:24 lr: 6.373230146864831e-05 sample/s: 41.81248052954649 loss: 0.0090 (0.0093) time: 0.1070 data: 0.0041 max mem: 5127
+ 2021-05-31 18:26:39,130 INFO torchdistill.misc.log Epoch: [1] [ 2000/11371] eta: 0:16:36 lr: 6.0800867704394224e-05 sample/s: 38.83039549696341 loss: 0.0071 (0.0092) time: 0.1047 data: 0.0042 max mem: 5127
+ 2021-05-31 18:28:26,200 INFO torchdistill.misc.log Epoch: [1] [ 3000/11371] eta: 0:14:51 lr: 5.7869433940140126e-05 sample/s: 39.90328388595947 loss: 0.0067 (0.0091) time: 0.1078 data: 0.0043 max mem: 5127
+ 2021-05-31 18:30:13,319 INFO torchdistill.misc.log Epoch: [1] [ 4000/11371] eta: 0:13:06 lr: 5.493800017588603e-05 sample/s: 44.45049226888797 loss: 0.0063 (0.0091) time: 0.1123 data: 0.0044 max mem: 5127
+ 2021-05-31 18:32:00,343 INFO torchdistill.misc.log Epoch: [1] [ 5000/11371] eta: 0:11:20 lr: 5.200656641163193e-05 sample/s: 42.26201253964295 loss: 0.0059 (0.0091) time: 0.1038 data: 0.0041 max mem: 5127
+ 2021-05-31 18:33:46,858 INFO torchdistill.misc.log Epoch: [1] [ 6000/11371] eta: 0:09:33 lr: 4.907513264737784e-05 sample/s: 49.07555906571307 loss: 0.0106 (0.0090) time: 0.1070 data: 0.0041 max mem: 5127
+ 2021-05-31 18:35:33,635 INFO torchdistill.misc.log Epoch: [1] [ 7000/11371] eta: 0:07:46 lr: 4.614369888312374e-05 sample/s: 25.658453477542068 loss: 0.0060 (0.0090) time: 0.1099 data: 0.0043 max mem: 5127
+ 2021-05-31 18:37:20,235 INFO torchdistill.misc.log Epoch: [1] [ 8000/11371] eta: 0:05:59 lr: 4.3212265118869647e-05 sample/s: 39.91771493831713 loss: 0.0083 (0.0089) time: 0.1023 data: 0.0043 max mem: 5127
+ 2021-05-31 18:39:05,960 INFO torchdistill.misc.log Epoch: [1] [ 9000/11371] eta: 0:04:12 lr: 4.028083135461555e-05 sample/s: 38.71909458904331 loss: 0.0066 (0.0088) time: 0.1048 data: 0.0040 max mem: 5127
+ 2021-05-31 18:40:52,955 INFO torchdistill.misc.log Epoch: [1] [10000/11371] eta: 0:02:26 lr: 3.734939759036144e-05 sample/s: 42.62135892732843 loss: 0.0072 (0.0087) time: 0.1054 data: 0.0041 max mem: 5127
+ 2021-05-31 18:42:40,041 INFO torchdistill.misc.log Epoch: [1] [11000/11371] eta: 0:00:39 lr: 3.441796382610735e-05 sample/s: 37.30525117404298 loss: 0.0061 (0.0087) time: 0.1100 data: 0.0050 max mem: 5127
+ 2021-05-31 18:43:19,562 INFO torchdistill.misc.log Epoch: [1] Total time: 0:20:13
+ 2021-05-31 18:43:46,461 INFO /usr/local/lib/python3.7/dist-packages/datasets/metric.py Removing /root/.cache/huggingface/metrics/glue/qqp/default_experiment-1-0.arrow
+ 2021-05-31 18:43:46,463 INFO __main__ Validation: accuracy = 0.9109572099925798, f1 = 0.8809602539514583
+ 2021-05-31 18:43:46,463 INFO __main__ Updating ckpt at ./resource/ckpt/glue/qqp/kd/qqp-bert-base-uncased_from_bert-large-uncased
+ 2021-05-31 18:43:47,833 INFO torchdistill.misc.log Epoch: [2] [ 0/11371] eta: 0:27:13 lr: 3.333040189956908e-05 sample/s: 38.18740425958087 loss: 0.0009 (0.0009) time: 0.1437 data: 0.0389 max mem: 5127
+ 2021-05-31 18:45:35,068 INFO torchdistill.misc.log Epoch: [2] [ 1000/11371] eta: 0:18:32 lr: 3.039896813531498e-05 sample/s: 47.289335866372774 loss: 0.0040 (0.0037) time: 0.1057 data: 0.0043 max mem: 5127
+ 2021-05-31 18:47:22,539 INFO torchdistill.misc.log Epoch: [2] [ 2000/11371] eta: 0:16:46 lr: 2.746753437106089e-05 sample/s: 39.70976293266682 loss: 0.0025 (0.0037) time: 0.1010 data: 0.0043 max mem: 5127
+ 2021-05-31 18:49:09,186 INFO torchdistill.misc.log Epoch: [2] [ 3000/11371] eta: 0:14:56 lr: 2.453610060680679e-05 sample/s: 48.54841612723069 loss: 0.0034 (0.0037) time: 0.1017 data: 0.0040 max mem: 5127
+ 2021-05-31 18:50:56,338 INFO torchdistill.misc.log Epoch: [2] [ 4000/11371] eta: 0:13:09 lr: 2.1604666842552692e-05 sample/s: 33.26529009846415 loss: 0.0033 (0.0037) time: 0.1129 data: 0.0041 max mem: 5127
+ 2021-05-31 18:52:42,168 INFO torchdistill.misc.log Epoch: [2] [ 5000/11371] eta: 0:11:20 lr: 1.8673233078298597e-05 sample/s: 45.14567412223107 loss: 0.0034 (0.0037) time: 0.1075 data: 0.0041 max mem: 5127
+ 2021-05-31 18:54:28,944 INFO torchdistill.misc.log Epoch: [2] [ 6000/11371] eta: 0:09:33 lr: 1.5741799314044502e-05 sample/s: 37.497018514710746 loss: 0.0024 (0.0036) time: 0.1027 data: 0.0041 max mem: 5127
+ 2021-05-31 18:56:15,863 INFO torchdistill.misc.log Epoch: [2] [ 7000/11371] eta: 0:07:47 lr: 1.2810365549790403e-05 sample/s: 45.71261665054562 loss: 0.0022 (0.0036) time: 0.1131 data: 0.0045 max mem: 5127
+ 2021-05-31 18:58:02,357 INFO torchdistill.misc.log Epoch: [2] [ 8000/11371] eta: 0:06:00 lr: 9.878931785536307e-06 sample/s: 41.93528197283503 loss: 0.0032 (0.0036) time: 0.1125 data: 0.0043 max mem: 5127
+ 2021-05-31 18:59:48,714 INFO torchdistill.misc.log Epoch: [2] [ 9000/11371] eta: 0:04:13 lr: 6.947498021282209e-06 sample/s: 42.596145399335306 loss: 0.0031 (0.0036) time: 0.1042 data: 0.0040 max mem: 5127
+ 2021-05-31 19:01:34,858 INFO torchdistill.misc.log Epoch: [2] [10000/11371] eta: 0:02:26 lr: 4.016064257028113e-06 sample/s: 39.73628539082077 loss: 0.0021 (0.0035) time: 0.1089 data: 0.0043 max mem: 5127
+ 2021-05-31 19:03:21,986 INFO torchdistill.misc.log Epoch: [2] [11000/11371] eta: 0:00:39 lr: 1.084630492774016e-06 sample/s: 39.76435119786877 loss: 0.0021 (0.0035) time: 0.1051 data: 0.0041 max mem: 5127
+ 2021-05-31 19:04:00,922 INFO torchdistill.misc.log Epoch: [2] Total time: 0:20:13
+ 2021-05-31 19:04:27,632 INFO /usr/local/lib/python3.7/dist-packages/datasets/metric.py Removing /root/.cache/huggingface/metrics/glue/qqp/default_experiment-1-0.arrow
+ 2021-05-31 19:04:27,634 INFO __main__ Validation: accuracy = 0.9132327479594361, f1 = 0.8830510734764635
+ 2021-05-31 19:04:27,634 INFO __main__ Updating ckpt at ./resource/ckpt/glue/qqp/kd/qqp-bert-base-uncased_from_bert-large-uncased
+ 2021-05-31 19:04:28,844 INFO __main__ [Teacher: bert-large-uncased]
+ 2021-05-31 19:05:13,867 INFO /usr/local/lib/python3.7/dist-packages/datasets/metric.py Removing /root/.cache/huggingface/metrics/glue/qqp/default_experiment-1-0.arrow
+ 2021-05-31 19:05:13,869 INFO __main__ Test: accuracy = 0.9108088053425674, f1 = 0.8808564065287783
+ 2021-05-31 19:05:17,512 INFO __main__ [Student: bert-base-uncased]
+ 2021-05-31 19:05:44,133 INFO /usr/local/lib/python3.7/dist-packages/datasets/metric.py Removing /root/.cache/huggingface/metrics/glue/qqp/default_experiment-1-0.arrow
+ 2021-05-31 19:05:44,134 INFO __main__ Test: accuracy = 0.9132327479594361, f1 = 0.8830510734764635
+ 2021-05-31 19:05:44,134 INFO __main__ Start prediction for private dataset(s)
+ 2021-05-31 19:05:44,135 INFO __main__ qqp/test: 390965 samples
vocab.txt ADDED
The diff for this file is too large to render. See raw diff