Training in progress, epoch 1

Browse files

Files changed (10) hide show

added_tokens.json +7 -0
config.json +36 -0
pytorch_model.bin +3 -0
special_tokens_map.json +9 -0
spiece.model +3 -0
tokenizer_config.json +76 -0
train-v1.1.json +0 -0
train_factual_consistency.ipynb +212 -0
training_args.bin +3 -0
utils.py +123 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "<pad>": 0,
+  "<unk>": 1,
+  "[CLS]": 2,
+  "[MASK]": 4,
+  "[SEP]": 3
+}

config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_name_or_path": "line-corporation/line-distilbert-base-japanese",
+  "activation": "gelu",
+  "architectures": [
+    "ConsistentSentenceClassifier"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "id2label": {
+    "0": "contradiction",
+    "1": "neutral",
+    "2": "entailment"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "contradiction": 0,
+    "entailment": 2,
+    "neutral": 1
+  },
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "output_hidden_states": true,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": true,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.34.0",
+  "vocab_size": 32768
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0882f733fa2213d98351829cca6db388f9664784abc99d90cb70d48beaaf16e6
+size 274758317

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "bos_token": "[CLS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "pad_token": "<pad>",
+  "sep_token": "[SEP]",
+  "unk_token": "<unk>"
+}

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcfafc8c0662d9c8f39621a64c74260f2ad120310c8dd24886de2dddaf599b4e
+size 439391

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,76 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "auto_map": {
+    "AutoTokenizer": [
+      "line-corporation/line-distilbert-base-japanese--distilbert_japanese_tokenizer.DistilBertJapaneseTokenizer",
+      null
+    ]
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "do_subword_tokenize": true,
+  "do_word_tokenize": true,
+  "eos_token": "[SEP]",
+  "jumanpp_kwargs": null,
+  "keep_accents": true,
+  "mask_token": "[MASK]",
+  "mecab_kwargs": {
+    "mecab_dic": "unidic_lite"
+  },
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "<pad>",
+  "remove_space": true,
+  "sep_token": "[SEP]",
+  "subword_tokenizer_type": "sentencepiece",
+  "sudachi_kwargs": null,
+  "tokenize_chinese_chars": false,
+  "tokenizer_class": "BertJapaneseTokenizer",
+  "tokenizer_file": null,
+  "unk_token": "<unk>",
+  "word_tokenizer_type": "mecab"
+}

train-v1.1.json ADDED Viewed

The diff for this file is too large to render. See raw diff

train_factual_consistency.ipynb ADDED Viewed

	@@ -0,0 +1,212 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b12ae8a3-9e08-402c-894c-31697fad6c56",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "54d7e7ee895949c4a025acf2c9640f96",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "160c80c1-0ca4-45df-8171-87cd3c88a223",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from transformers import (\n",
+    "    AutoTokenizer,\n",
+    "    DataCollatorWithPadding,\n",
+    "    Trainer,\n",
+    "    TrainingArguments,\n",
+    ")\n",
+    "from utils import ConsistentSentenceClassifier, get_metrics, load_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "25800588-5d42-4524-9dc6-a6a0c180b8b0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                                  text  label\n",
+      "512  カーキ色の服を着た男性が、口元にリンゴを当てています。[SEP]カーキ色の服を着た男性が、口...      0\n",
+      "513    男性がグラウンドでボールを投げています。[SEP]白い髯を生やした男性がボールを投げています。      1\n",
+      "514  椅子に座った子供が、手づかみで食事をしています。[SEP]椅子に座った子供が手づかみで、食事...      2\n",
+      "515         プロペラ機が何台も駐機しています。[SEP]プロペラ機が何台も連なって飛んでいます。      0\n",
+      "516  消火栓から水が勢いよく噴き出しています。[SEP]水が噴き出している消火栓の水を浴びるように...      1\n",
+      "517  冷蔵庫のないキッチンにナイフとフォークが置かれています。[SEP]冷蔵庫の置かれたキッチンに...      0\n",
+      "518  うみでサーフィンをしているひとがいます。[SEP]黒いウェットスーツを着た人がサーフボードに...      1\n",
+      "519             池から白い鳥が飛び立っています。[SEP]森にある水の上を鳥が飛んでいます。      1\n",
+      "520       丈夫なビーチパラソルが立っています。[SEP]ビーチパラソルの支柱が折れ曲がっています。      0\n",
+      "521  白髪の男性が少女から花束を受け取っています。[SEP]花束を持った男性の前に多くの子供たちが...      1\n",
+      "                                                text  label\n",
+      "0    赤いひとつの傘に、二人の人が入っています。[SEP]歩道を歩く通行人が傘をさして歩いています。      1\n",
+      "1              川を小さなボートが進んで行きます。[SEP]川を豪華客船が進んでいきます。      0\n",
+      "2  ゲレンデのこぶでスキージャンプしています。[SEP]雪上でモーグルを楽しむ水色のウェアを着た女性。      1\n",
+      "3       黒いお皿に乗っているピザをカットしています。[SEP]黒い皿の上にピザが盛られています。      2\n",
+      "4    女性が目を細めて携帯電話で話をしています。[SEP]目を細めた女性が携帯電話で話をしています。      2\n",
+      "5  バナナやパパイヤなどの果物が売られている。[SEP]台の上にはバナナなどの青果が並べられています。      1\n",
+      "6  ヘッドライトを点灯させた白いバスが駐車場に止まっています。[SEP]ライトを点灯させているバ...      2\n",
+      "7  水面の上に、カイトサーフィンの凧が揚がっています。[SEP]海の上に水上スポーツ用の凧が揚が...      1\n",
+      "8        ホットドッグを野外で食べている人たちです。[SEP]家の中でホットドッグを食べている。      0\n",
+      "9  草が生い茂っている所に、3頭のゾウがいます。[SEP]草むらの中に三頭のゾウが立っているとこ...      1\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "014ef81bb16c41a383f86e2ddc5ca383",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/19561 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.\n",
+      "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7024f08501734cd188d3b8c6dc8495eb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/512 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "tokenizer = AutoTokenizer.from_pretrained(\"line-corporation/line-distilbert-base-japanese\")\n",
+    "dataset = load_dataset('train-v1.1.json')\n",
+    "tokenized_dataset = dataset.map(\n",
+    "    lambda examples: tokenizer(examples[\"text\"], padding='max_length', truncation=True), batched=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6bc83d4c-378c-4313-b641-8ead0c02f715",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:root:XRT configuration not detected. Defaulting to preview PJRT runtime. To silence this warning and continue using PJRT, explicitly set PJRT_DEVICE to a supported device or configure XRT. To disable default device selection, set PJRT_SELECT_DEFAULT_DEVICE=0\n",
+      "WARNING:root:For more information about the status of PJRT, see https://github.com/pytorch/xla/blob/master/docs/pjrt.md\n",
+      "WARNING:root:Defaulting to PJRT_DEVICE=CPU\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = ConsistentSentenceClassifier(\n",
+    "    freeze_bert=True)\n",
+    "\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"../factual-consistency-classification-ja-avgpool\",\n",
+    "    learning_rate=1e-4,\n",
+    "    per_device_train_batch_size=64,\n",
+    "    per_device_eval_batch_size=8,\n",
+    "    num_train_epochs=30,\n",
+    "    weight_decay=0.02,\n",
+    "    evaluation_strategy=\"epoch\",\n",
+    "    eval_accumulation_steps=4,\n",
+    "    save_strategy=\"epoch\",\n",
+    "    load_best_model_at_end=True,\n",
+    "    save_total_limit=5,\n",
+    "    push_to_hub=True,\n",
+    ")\n",
+    "\n",
+    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=tokenized_dataset[\"train\"],\n",
+    "    eval_dataset=tokenized_dataset[\"test\"],\n",
+    "    tokenizer=tokenizer,\n",
+    "    data_collator=data_collator,\n",
+    "    compute_metrics=get_metrics(),\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n",
+    "trainer.push_to_hub('factual-consistency-classification-ja-avgpool')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6eb93f7-5a38-49a2-be0d-e42267e23a0a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "environment": {
+   "kernel": "python3",
+   "name": "pytorch-gpu.2-0.m112",
+   "type": "gcloud",
+   "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.2-0:m112"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85ab236f32cbef6bf6bf1c471ec41a7226363e3945c5bf62fbf3728eca74dee1
+size 4155

utils.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import json
+import pandas as pd
+import datasets
+import numpy as np
+import evaluate
+import torch
+from transformers import AutoModel, DistilBertForSequenceClassification
+from transformers.modeling_outputs import SequenceClassifierOutput
+from typing import Optional
+SEP_TOKEN = '[SEP]'  # separator placed between sentence1 and sentence2 when format_dataset builds the 'text' column
+LABEL2ID = {'entailment': 2, 'neutral': 1, 'contradiction': 0}  # label name -> integer class id
+ID2LABEL = {2: 'entailment', 1: 'neutral', 0: 'contradiction'}  # integer class id -> label name (inverse of LABEL2ID)
+def format_dataset(arr):
+    text = [el['sentence1'] + SEP_TOKEN + el['sentence2'] for el in arr]
+    label = [LABEL2ID[el['label']] for el in arr]
+    new_df = pd.DataFrame({'text': text, 'label': label})
+    return new_df.sample(frac=1, random_state=42).reset_index(drop=True)
+# Load dataset
+def load_dataset(path):
+    train_array = []
+    with open(path) as f:
+        for line in f.readlines():
+            if line:
+                train_array.append(json.loads(line))
+    df = format_dataset(train_array)
+    # Split dataset into train and val
+    df_train = df.iloc[512:, :]
+    # We do not need much test data
+    df_test = df.iloc[:512, :]
+    print(df_train[:10])
+    print(df_test[:10])
+    factual_consistency_dataset = datasets.dataset_dict.DatasetDict()
+    factual_consistency_dataset["train"] = datasets.dataset_dict.Dataset.from_pandas(
+        df_train[["text", "label"]])
+    factual_consistency_dataset["test"] = datasets.dataset_dict.Dataset.from_pandas(
+        df_test[["text", "label"]])
+    return factual_consistency_dataset
+class ConsistentSentenceClassifier(DistilBertForSequenceClassification):
+    def __init__(self, freeze_bert=True):
+        base_model = AutoModel.from_pretrained(
+            'line-corporation/line-distilbert-base-japanese', num_labels=3)
+        config = base_model.config
+        super(ConsistentSentenceClassifier, self).__init__(config=config)
+        config.num_labels = 3
+        config.id2label = ID2LABEL
+        config.label2id = LABEL2ID
+        config.problem_type = "single_label_classification"
+        self.distilbert = base_model
+        if not freeze_bert:
+            return
+        for param in self.distilbert.parameters():
+            param.requires_grad = False
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        distilbert_output = self.distilbert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
+        pooled_output = torch.mean(hidden_state, dim=1)
+        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
+        pooled_output = torch.nn.ReLU()(pooled_output)  # (bs, dim)
+        pooled_output = self.dropout(pooled_output)  # (bs, dim)
+        logits = self.classifier(pooled_output)  # (bs, num_labels)
+        loss = None
+        if labels is not None:
+            loss_fct = torch.nn.CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+        if not return_dict:
+            output = (logits,) + distilbert_output[1:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=distilbert_output.hidden_states,
+            attentions=distilbert_output.attentions,
+        )
+# Set up evaluation metric
+def get_metrics():
+    metric = evaluate.load("accuracy")
+    def compute_metrics(eval_pred):
+        predictions, labels = eval_pred
+        preds = predictions[0].argmax(axis=1)
+        return metric.compute(predictions=preds, references=labels)
+    return compute_metrics