florian-hoenicke committed
Commit • d2b8f89 • Parent(s): a7fc441

feat: push custom model

Files changed:
- README.md +5 -5
- config.json +4 -4
- configuration_bert.py +16 -25
- model.safetensors +2 -2
- modeling_bert.py +89 -44
- special_tokens_map.json +6 -20
- tokenizer.json +0 -0
- tokenizer_config.json +22 -22
- training_args.bin +1 -1
- vocab.txt +0 -0
README.md
CHANGED
@@ -12,14 +12,14 @@ tags:
 - sentence-similarity
 - mteb
 - Ubuntu
-- Linux
-- Software
-- OperatingSystem
 - Technical
+- Support
+- Linux
+- Community
 ---
-This model is a fine-tuned version of [**jinaai/jina-embeddings-v2-base-
+This model is a fine-tuned version of [**jinaai/jina-embeddings-v2-base-en**](https://huggingface.co/jinaai/jina-embeddings-v2-base-en) designed for the following use case:

-technical support
+technical support for Ubuntu

 ## How to Use
 This model can be easily integrated into your NLP pipeline for tasks such as text classification, sentiment analysis, entity recognition, and more. Here's a simple example to get you started:
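The README's own snippet is not part of this hunk. A minimal sketch of the kind of usage it describes, assuming the fine-tuned checkpoint is published under a placeholder repo id (`your-org/jina-ubuntu-support-embeddings` below) and keeps the base model's mean-pooling setup:

```python
# Minimal sketch (not the README's own snippet): embed Ubuntu support texts
# and compare them with cosine similarity. The repo id is a placeholder.
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

repo_id = "your-org/jina-ubuntu-support-embeddings"  # placeholder, replace with the actual repo
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)  # runs modeling_bert.py from the repo

texts = [
    "How do I upgrade all packages on Ubuntu 22.04?",
    "Run `sudo apt update && sudo apt upgrade` in a terminal.",
]
batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    token_embeddings = model(**batch).last_hidden_state  # (batch, seq, hidden)

# Mean pooling over non-padding tokens (config.json sets "emb_pooler": "mean")
mask = batch["attention_mask"].unsqueeze(-1).float()
embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)

print(F.cosine_similarity(embeddings[0], embeddings[1], dim=0).item())
```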
config.json
CHANGED
@@ -8,15 +8,15 @@
   "auto_map": {
     "AutoConfig": "configuration_bert.JinaBertConfig",
     "AutoModel": "modeling_bert.JinaBertModel",
-    "AutoModelForMaskedLM": "jinaai/jina-bert-
-    "AutoModelForSequenceClassification": "jinaai/jina-bert-
+    "AutoModelForMaskedLM": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForMaskedLM",
+    "AutoModelForSequenceClassification": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForSequenceClassification"
   },
   "classifier_dropout": null,
   "emb_pooler": "mean",
   "feed_forward_type": "geglu",
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.
+  "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "initializer_range": 0.02,
   "intermediate_size": 3072,
@@ -32,5 +32,5 @@
   "transformers_version": "4.40.2",
   "type_vocab_size": 2,
   "use_cache": true,
-  "vocab_size":
+  "vocab_size": 30528
 }
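The updated `auto_map` routes the Auto classes to custom code: `AutoConfig` and `AutoModel` resolve to `configuration_bert.py` / `modeling_bert.py` in this repo, while the `jinaai/jina-bert-implementation--modeling_bert.*` entries point at classes hosted in that separate repository, so loading requires `trust_remote_code=True`. A hedged sketch of what that resolution looks like in practice (placeholder repo id):

```python
# Sketch of how the auto_map above is consumed at load time (placeholder repo id).
from transformers import AutoConfig, AutoModel

repo_id = "your-org/jina-ubuntu-support-embeddings"  # placeholder, not the real repo name

# trust_remote_code=True lets transformers execute configuration_bert.py / modeling_bert.py
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(type(config).__name__)        # JinaBertConfig
print(config.vocab_size)            # 30528
print(config.hidden_dropout_prob)   # 0.1

# AutoModel maps to modeling_bert.JinaBertModel from this repo; the
# "jinaai/jina-bert-implementation--..." entries would likewise pull the
# MaskedLM / SequenceClassification classes from that other repository.
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
print(type(model).__name__)         # JinaBertModel
```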
configuration_bert.py
CHANGED
@@ -17,18 +17,11 @@
 """ BERT model configuration"""
 from collections import OrderedDict
 from typing import Mapping
-import warnings

 from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfig
 from transformers.utils import logging

-try:
-    from optimum.exporters.onnx.model_configs import BertOnnxConfig
-    OPTIMUM_INSTALLED = True
-except ImportError:
-    warnings.warn("optimum is not installed. To use OnnxConfig and BertOnnxConfig, make sure that `optimum` package is installed")
-    OPTIMUM_INSTALLED = False
-

 logger = logging.get_logger(__name__)

@@ -135,7 +128,7 @@ class JinaBertConfig(PretrainedConfig):
         classifier_dropout=None,
         feed_forward_type="original",
         emb_pooler=None,
-        attn_implementation=
+        attn_implementation='torch',
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -159,19 +152,17 @@
         self.emb_pooler = emb_pooler
         self.attn_implementation = attn_implementation

-[removed lines not rendered in this view]
-            ]
-        )
+class JinaBertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+                ("token_type_ids", dynamic_axis),
+            ]
+        )
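The new `JinaBertOnnxConfig` replaces the optional `optimum`-based import block and declares dynamic batch/sequence axes for the three standard BERT inputs. A minimal sketch of inspecting it, assuming the file has been downloaded locally as `configuration_bert.py`:

```python
# Sketch: instantiate the ONNX config and inspect the dynamic axes it declares.
from configuration_bert import JinaBertConfig, JinaBertOnnxConfig  # local files from this repo

config = JinaBertConfig()
onnx_config = JinaBertOnnxConfig(config, task="default")

for name, axes in onnx_config.inputs.items():
    print(name, axes)
# input_ids {0: 'batch', 1: 'sequence'}
# attention_mask {0: 'batch', 1: 'sequence'}
# token_type_ids {0: 'batch', 1: 'sequence'}
```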
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size 
+oid sha256:e5cbde1a065989fc5e605ac6d44f15ee212a5bbe0e7af7c9a3a045d1ada6de5f
+size 549493968
modeling_bert.py
CHANGED
@@ -280,10 +280,9 @@ class JinaBertSelfAttention(nn.Module):
         self.query = nn.Linear(config.hidden_size, self.all_head_size)
         self.key = nn.Linear(config.hidden_size, self.all_head_size)
         self.value = nn.Linear(config.hidden_size, self.all_head_size)
-        self.layer_norm_q = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.layer_norm_k = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

-        self.
+        self.dropout_p = config.attention_probs_dropout_prob
+        self.dropout = nn.Dropout(self.dropout_p)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -317,7 +316,7 @@
         output_attentions: Optional[bool] = False,
         bias: Optional[torch.FloatTensor] = None,
     ) -> Tuple[torch.Tensor]:
-        mixed_query_layer = self.
+        mixed_query_layer = self.query(hidden_states)

         # If this is instantiated as a cross-attention module, the keys
         # and values come from an encoder; the attention mask needs to be
@@ -330,16 +329,16 @@
             value_layer = past_key_value[1]
             attention_mask = encoder_attention_mask
         elif is_cross_attention:
-            key_layer = self.transpose_for_scores(self.
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
             value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
             attention_mask = encoder_attention_mask
         elif past_key_value is not None:
-            key_layer = self.transpose_for_scores(self.
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
             value_layer = self.transpose_for_scores(self.value(hidden_states))
             key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
             value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
         else:
-            key_layer = self.transpose_for_scores(self.
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
             value_layer = self.transpose_for_scores(self.value(hidden_states))

         query_layer = self.transpose_for_scores(mixed_query_layer)
@@ -358,7 +357,8 @@
         if self.attn_implementation == 'torch' and scaled_dot_product_attention is not None:
             b, _, s, _ = query_layer.shape
             new_bias = attention_mask + bias
-[removed line not rendered in this view]
+            dropout_p = self.dropout_p if self.training else 0.0
+            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias, dropout_p=dropout_p)
             attn = attn.permute(0, 2, 1, 3).contiguous()
             return (attn.view(b, s, self.all_head_size),)

@@ -431,7 +431,7 @@
         context_layer = context_layer.view(new_context_layer_shape)

         outputs = (
-            (context_layer,
+            (context_layer, attention_probs) if output_attentions else (context_layer,)
         )

         if self.is_decoder:
|
516 |
|
517 |
|
518 |
+
class JinaBertIntermediate(nn.Module):
|
519 |
+
def __init__(self, config):
|
520 |
+
super().__init__()
|
521 |
+
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
522 |
+
if isinstance(config.hidden_act, str):
|
523 |
+
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
524 |
+
else:
|
525 |
+
self.intermediate_act_fn = config.hidden_act
|
526 |
+
|
527 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
528 |
+
hidden_states = self.dense(hidden_states)
|
529 |
+
hidden_states = self.intermediate_act_fn(hidden_states)
|
530 |
+
return hidden_states
|
531 |
+
|
532 |
+
|
533 |
+
class JinaBertOutput(nn.Module):
|
534 |
def __init__(self, config: JinaBertConfig):
|
535 |
super().__init__()
|
536 |
+
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
537 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
|
|
|
|
|
|
|
|
538 |
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
539 |
|
540 |
+
def forward(
|
541 |
+
self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
|
542 |
+
) -> torch.Tensor:
|
543 |
+
hidden_states = self.dense(hidden_states)
|
544 |
+
hidden_states = self.dropout(hidden_states)
|
545 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
546 |
+
return hidden_states
|
547 |
|
548 |
|
549 |
class JinaBertGLUMLP(nn.Module):
|
550 |
def __init__(self, config: JinaBertConfig):
|
551 |
super().__init__()
|
552 |
self.config = config
|
553 |
+
self.gated_layers = nn.Linear(
|
554 |
+
config.hidden_size, config.intermediate_size * 2, bias=False
|
555 |
+
)
|
556 |
if config.feed_forward_type == 'reglu':
|
557 |
self.act = nn.ReLU()
|
558 |
elif config.feed_forward_type == 'geglu':
|
|
|
561 |
raise ValueError(
|
562 |
f"feed_forward_type {config.feed_forward_type} not supported"
|
563 |
)
|
564 |
+
self.wo = nn.Linear(config.intermediate_size, config.hidden_size)
|
|
|
|
|
|
|
565 |
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
566 |
+
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
567 |
|
568 |
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
569 |
+
residual_connection = hidden_states
|
570 |
+
# compute the activation
|
571 |
+
hidden_states = self.gated_layers(hidden_states)
|
572 |
+
gated = hidden_states[:, :, : self.config.intermediate_size]
|
573 |
+
non_gated = hidden_states[:, :, self.config.intermediate_size :]
|
574 |
+
hidden_states = self.act(gated) * non_gated
|
575 |
+
hidden_states = self.dropout(hidden_states)
|
576 |
+
# multiply by the second matrix
|
577 |
+
hidden_states = self.wo(hidden_states)
|
578 |
+
# add the residual connection and post-LN
|
579 |
+
hidden_states = self.layernorm(hidden_states + residual_connection)
|
580 |
+
return hidden_states
|
581 |
|
582 |
|
583 |
class JinaBertLayer(nn.Module):
|
|
|
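`JinaBertGLUMLP` now owns the gated projection (`gated_layers`), the output projection (`wo`), and a post-LayerNorm with a residual connection. A small standalone sketch of the same GEGLU computation, with illustrative dimensions rather than the repo's modules:

```python
# Sketch of the GEGLU feed-forward pattern used by JinaBertGLUMLP.
import torch
import torch.nn as nn

hidden, intermediate = 768, 3072
gated_layers = nn.Linear(hidden, intermediate * 2, bias=False)
wo = nn.Linear(intermediate, hidden)
act = nn.GELU()
layernorm = nn.LayerNorm(hidden, eps=1e-12)

x = torch.randn(2, 16, hidden)                  # (batch, seq, hidden)
residual = x
h = gated_layers(x)                             # (batch, seq, 2 * intermediate)
gated, non_gated = h[..., :intermediate], h[..., intermediate:]
h = act(gated) * non_gated                      # gate one half with the other
h = wo(h)                                       # project back to hidden size
out = layernorm(h + residual)                   # residual + post-LayerNorm
print(out.shape)                                # torch.Size([2, 16, 768])
```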
@@ -572,8 +589,6 @@ class JinaBertLayer(nn.Module):
         self.is_decoder = config.is_decoder
         self.add_cross_attention = config.add_cross_attention
         self.feed_forward_type = config.feed_forward_type
-        self.layer_norm_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.layer_norm_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         if self.add_cross_attention:
             if not self.is_decoder:
                 raise ValueError(
@@ -585,7 +600,8 @@
         if self.feed_forward_type.endswith('glu'):
             self.mlp = JinaBertGLUMLP(config)
         else:
-            self.
+            self.intermediate = JinaBertIntermediate(config)
+            self.output = JinaBertOutput(config)

     def forward(
         self,
@@ -598,9 +614,6 @@
         past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         output_attentions: Optional[bool] = False,
     ) -> Tuple[torch.Tensor]:
-        # Pre-Norm
-        residual = hidden_states
-
         # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
         self_attn_past_key_value = (
             past_key_value[:2] if past_key_value is not None else None
@@ -654,9 +667,15 @@
             cross_attn_present_key_value = cross_attention_outputs[-1]
             present_key_value = present_key_value + cross_attn_present_key_value

-[removed lines not rendered in this view]
+        if self.feed_forward_type.endswith('glu'):
+            layer_output = self.mlp(attention_output)
+        else:
+            layer_output = apply_chunking_to_forward(
+                self.feed_forward_chunk,
+                self.chunk_size_feed_forward,
+                self.seq_len_dim,
+                attention_output,
+            )
         outputs = (layer_output,) + outputs

         # if decoder, return the attn key/values as the last output
@@ -665,6 +684,11 @@

         return outputs

+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+

 class JinaBertEncoder(nn.Module):
     def __init__(self, config: JinaBertConfig):
@@ -675,6 +699,11 @@
         )
         self.gradient_checkpointing = False
         self.num_attention_heads = config.num_attention_heads
+        self.register_buffer(
+            "alibi",
+            self.rebuild_alibi_tensor(size=config.max_position_embeddings),
+            persistent=False,
+        )

     def rebuild_alibi_tensor(
         self, size: int, device: Optional[Union[torch.device, str]] = None
@@ -742,7 +771,23 @@

         # Add alibi matrix to extended_attention_mask
         _, seqlen, _ = hidden_states.size()
-[removed line not rendered in this view]
+        if self._current_alibi_size < seqlen:
+            # Rebuild the alibi tensor when needed
+            warnings.warn(
+                f'Increasing alibi size from {self._current_alibi_size} to {seqlen}.'
+            )
+            self.register_buffer(
+                "alibi",
+                self.rebuild_alibi_tensor(size=seqlen, device=hidden_states.device).to(
+                    hidden_states.dtype
+                ),
+                persistent=False,
+            )
+        elif self.alibi.device != hidden_states.device:
+            # Device catch-up
+            self.alibi = self.alibi.to(hidden_states.device)
+
+        alibi_bias = self.alibi[:, :, :seqlen, :seqlen]
         if self.gradient_checkpointing and self.training:
             if use_cache:
                 logger.warning_once(
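The encoder now registers the `alibi` buffer up front (sized to `max_position_embeddings`) and lazily regrows or moves it in `forward`. A simplified sketch of how such an ALiBi bias tensor can be built: the standard symmetric-ALiBi recipe for a power-of-two head count, not necessarily the exact logic inside `rebuild_alibi_tensor`:

```python
# Simplified symmetric ALiBi sketch: per-head slope times (negative) token distance.
import torch

def alibi_bias(num_heads: int, size: int) -> torch.Tensor:
    # Geometric slopes 2^(-8/n), 2^(-16/n), ... as in the ALiBi paper (power-of-two head counts).
    slopes = torch.tensor([2.0 ** (-8.0 * (i + 1) / num_heads) for i in range(num_heads)])
    positions = torch.arange(size)
    distance = (positions[None, :] - positions[:, None]).abs()   # (size, size)
    bias = -slopes[:, None, None] * distance[None, :, :]         # (heads, size, size)
    return bias.unsqueeze(0)                                     # (1, heads, size, size)

bias = alibi_bias(num_heads=8, size=16)
print(bias.shape)   # torch.Size([1, 8, 16, 16])
# In the encoder above, the buffer is sliced as alibi[:, :, :seqlen, :seqlen]
# and added to the attention mask before attention is computed.
```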
special_tokens_map.json
CHANGED
@@ -1,48 +1,34 @@
 {
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "cls_token": {
-    "content": "
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "</s>",
+    "content": "[CLS]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "mask_token": {
-    "content": "
-    "lstrip": 
+    "content": "[MASK]",
+    "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "
+    "content": "[PAD]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
-    "content": "
+    "content": "[SEP]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "unk_token": {
-    "content": "
+    "content": "[UNK]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json
CHANGED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
CHANGED
@@ -1,57 +1,57 @@
 {
-  "add_prefix_space": false,
   "added_tokens_decoder": {
     "0": {
-      "content": "
+      "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
+    "100": {
+      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
+    "101": {
+      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
+    "102": {
+      "content": "[SEP]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
-      "lstrip": 
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
-  "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
-  "cls_token": "
-  "
-  "
-  "mask_token": "
-  "model_max_length": 
-  "
-  "
-  "
-  "
-  "
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 2147483648,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
 }
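Both tokenizer files switch from RoBERTa-style specials (`<s>`, `</s>`, `<mask>`) to the BERT vocabulary's `[PAD]`/`[UNK]`/`[CLS]`/`[SEP]`/`[MASK]` with `BertTokenizer` as the tokenizer class. A quick sanity-check sketch (placeholder repo id):

```python
# Check the BERT-style special tokens after the switch (placeholder repo id).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-org/jina-ubuntu-support-embeddings")  # placeholder
print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token, tokenizer.mask_token)
# [CLS] [SEP] [PAD] [MASK]

ids = tokenizer("How do I mount a USB drive?")["input_ids"]
print(ids[0], ids[-1])  # 101 102 -> [CLS] ... [SEP], matching added_tokens_decoder above
```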
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:db9c2a1f1e15a402ec8b4ea591e6d667a5f19b4e63a681ac8eff6f8a74adf67b
 size 4719
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff