florian-hoenicke committed
Commit • d2b8f89 • Parent(s): a7fc441

feat: push custom model

Files changed:
- README.md +5 -5
- config.json +4 -4
- configuration_bert.py +16 -25
- model.safetensors +2 -2
- modeling_bert.py +89 -44
- special_tokens_map.json +6 -20
- tokenizer.json +0 -0
- tokenizer_config.json +22 -22
- training_args.bin +1 -1
- vocab.txt +0 -0
README.md
CHANGED
@@ -12,14 +12,14 @@ tags:
 - sentence-similarity
 - mteb
 - Ubuntu
-- Linux
-- Software
-- OperatingSystem
 - Technical
+- Support
+- Linux
+- Community
 ---
-This model is a fine-tuned version of [**jinaai/jina-embeddings-v2-base-
+This model is a fine-tuned version of [**jinaai/jina-embeddings-v2-base-en**](https://huggingface.co/jinaai/jina-embeddings-v2-base-en) designed for the following use case:

-technical support
+technical support for Ubuntu

 ## How to Use
 This model can be easily integrated into your NLP pipeline for tasks such as text classification, sentiment analysis, entity recognition, and more. Here's a simple example to get you started:
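The README's own snippet is not part of this hunk. A minimal sketch of the kind of usage it describes, assuming the fine-tuned checkpoint is published under a placeholder repo id (`your-org/jina-ubuntu-support-embeddings` below) and keeps the base model's mean-pooling setup:

```python
# Minimal sketch (not the README's own snippet): embed Ubuntu support texts
# and compare them with cosine similarity. The repo id is a placeholder.
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

repo_id = "your-org/jina-ubuntu-support-embeddings"  # placeholder, replace with the actual repo
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)  # runs modeling_bert.py from the repo

texts = [
    "How do I upgrade all packages on Ubuntu 22.04?",
    "Run `sudo apt update && sudo apt upgrade` in a terminal.",
]
batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    token_embeddings = model(**batch).last_hidden_state  # (batch, seq, hidden)

# Mean pooling over non-padding tokens (config.json sets "emb_pooler": "mean")
mask = batch["attention_mask"].unsqueeze(-1).float()
embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)

print(F.cosine_similarity(embeddings[0], embeddings[1], dim=0).item())
```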
config.json
CHANGED
@@ -8,15 +8,15 @@
   "auto_map": {
     "AutoConfig": "configuration_bert.JinaBertConfig",
     "AutoModel": "modeling_bert.JinaBertModel",
-    "AutoModelForMaskedLM": "jinaai/jina-bert-
-    "AutoModelForSequenceClassification": "jinaai/jina-bert-
+    "AutoModelForMaskedLM": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForMaskedLM",
+    "AutoModelForSequenceClassification": "jinaai/jina-bert-implementation--modeling_bert.JinaBertForSequenceClassification"
   },
   "classifier_dropout": null,
   "emb_pooler": "mean",
   "feed_forward_type": "geglu",
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.
+  "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "initializer_range": 0.02,
   "intermediate_size": 3072,
@@ -32,5 +32,5 @@
   "transformers_version": "4.40.2",
   "type_vocab_size": 2,
   "use_cache": true,
-  "vocab_size":
+  "vocab_size": 30528
 }
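The updated `auto_map` routes the Auto classes to custom code: `AutoConfig` and `AutoModel` resolve to `configuration_bert.py` / `modeling_bert.py` in this repo, while the `jinaai/jina-bert-implementation--modeling_bert.*` entries point at classes hosted in that separate repository, so loading requires `trust_remote_code=True`. A hedged sketch of what that resolution looks like in practice (placeholder repo id):

```python
# Sketch of how the auto_map above is consumed at load time (placeholder repo id).
from transformers import AutoConfig, AutoModel

repo_id = "your-org/jina-ubuntu-support-embeddings"  # placeholder, not the real repo name

# trust_remote_code=True lets transformers execute configuration_bert.py / modeling_bert.py
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(type(config).__name__)        # JinaBertConfig
print(config.vocab_size)            # 30528
print(config.hidden_dropout_prob)   # 0.1

# AutoModel maps to modeling_bert.JinaBertModel from this repo; the
# "jinaai/jina-bert-implementation--..." entries would likewise pull the
# MaskedLM / SequenceClassification classes from that other repository.
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
print(type(model).__name__)         # JinaBertModel
```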
configuration_bert.py
CHANGED
@@ -17,18 +17,11 @@
 """ BERT model configuration"""
 from collections import OrderedDict
 from typing import Mapping
-import warnings

 from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfig
 from transformers.utils import logging

-try:
-    from optimum.exporters.onnx.model_configs import BertOnnxConfig
-    OPTIMUM_INSTALLED = True
-except ImportError:
-    warnings.warn("optimum is not installed. To use OnnxConfig and BertOnnxConfig, make sure that `optimum` package is installed")
-    OPTIMUM_INSTALLED = False
-

 logger = logging.get_logger(__name__)

@@ -135,7 +128,7 @@ class JinaBertConfig(PretrainedConfig):
         classifier_dropout=None,
         feed_forward_type="original",
         emb_pooler=None,
-        attn_implementation=
+        attn_implementation='torch',
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -159,19 +152,17 @@
         self.emb_pooler = emb_pooler
         self.attn_implementation = attn_implementation

-[removed lines not rendered in this view]
-            ]
-        )
+class JinaBertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+                ("token_type_ids", dynamic_axis),
+            ]
+        )
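The new `JinaBertOnnxConfig` replaces the optional `optimum`-based import block and declares dynamic batch/sequence axes for the three standard BERT inputs. A minimal sketch of inspecting it, assuming the file has been downloaded locally as `configuration_bert.py`:

```python
# Sketch: instantiate the ONNX config and inspect the dynamic axes it declares.
from configuration_bert import JinaBertConfig, JinaBertOnnxConfig  # local files from this repo

config = JinaBertConfig()
onnx_config = JinaBertOnnxConfig(config, task="default")

for name, axes in onnx_config.inputs.items():
    print(name, axes)
# input_ids {0: 'batch', 1: 'sequence'}
# attention_mask {0: 'batch', 1: 'sequence'}
# token_type_ids {0: 'batch', 1: 'sequence'}
```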
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size 
+oid sha256:e5cbde1a065989fc5e605ac6d44f15ee212a5bbe0e7af7c9a3a045d1ada6de5f
+size 549493968
modeling_bert.py
CHANGED
@@ -280,10 +280,9 @@ class JinaBertSelfAttention(nn.Module):
         self.query = nn.Linear(config.hidden_size, self.all_head_size)
         self.key = nn.Linear(config.hidden_size, self.all_head_size)
         self.value = nn.Linear(config.hidden_size, self.all_head_size)
-        self.layer_norm_q = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.layer_norm_k = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

-        self.
+        self.dropout_p = config.attention_probs_dropout_prob
+        self.dropout = nn.Dropout(self.dropout_p)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -317,7 +316,7 @@
         output_attentions: Optional[bool] = False,
         bias: Optional[torch.FloatTensor] = None,
     ) -> Tuple[torch.Tensor]:
-        mixed_query_layer = self.
+        mixed_query_layer = self.query(hidden_states)

         # If this is instantiated as a cross-attention module, the keys
         # and values come from an encoder; the attention mask needs to be
@@ -330,16 +329,16 @@
             value_layer = past_key_value[1]
             attention_mask = encoder_attention_mask
         elif is_cross_attention:
-            key_layer = self.transpose_for_scores(self.
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
             value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
             attention_mask = encoder_attention_mask
         elif past_key_value is not None:
-            key_layer = self.transpose_for_scores(self.
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
             value_layer = self.transpose_for_scores(self.value(hidden_states))
             key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
             value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
         else:
-            key_layer = self.transpose_for_scores(self.
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
             value_layer = self.transpose_for_scores(self.value(hidden_states))

         query_layer = self.transpose_for_scores(mixed_query_layer)
@@ -358,7 +357,8 @@
         if self.attn_implementation == 'torch' and scaled_dot_product_attention is not None:
             b, _, s, _ = query_layer.shape
             new_bias = attention_mask + bias
-[removed line not rendered in this view]
+            dropout_p = self.dropout_p if self.training else 0.0
+            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias, dropout_p=dropout_p)
             attn = attn.permute(0, 2, 1, 3).contiguous()
             return (attn.view(b, s, self.all_head_size),)

@@ -431,7 +431,7 @@
         context_layer = context_layer.view(new_context_layer_shape)

         outputs = (
-            (context_layer,
+            (context_layer, attention_probs) if output_attentions else (context_layer,)
         )

         if self.is_decoder:
|
516 |
|
517 |
|
518 |
+
class JinaBertIntermediate(nn.Module):
|
519 |
+
def __init__(self, config):
|
520 |
+
super().__init__()
|
521 |
+
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
522 |
+
if isinstance(config.hidden_act, str):
|
523 |
+
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
524 |
+
else:
|
525 |
+
self.intermediate_act_fn = config.hidden_act
|
526 |
+
|
527 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
528 |
+
hidden_states = self.dense(hidden_states)
|
529 |
+
hidden_states = self.intermediate_act_fn(hidden_states)
|
530 |
+
return hidden_states
|
531 |
+
|
532 |
+
|
533 |
+
class JinaBertOutput(nn.Module):
|
534 |
def __init__(self, config: JinaBertConfig):
|
535 |
super().__init__()
|
536 |
+
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
537 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
|
|
|
|
|
|
|
|
538 |
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
539 |
|
540 |
+
def forward(
|
541 |
+
self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
|
542 |
+
) -> torch.Tensor:
|
543 |
+
hidden_states = self.dense(hidden_states)
|
544 |
+
hidden_states = self.dropout(hidden_states)
|
545 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
546 |
+
return hidden_states
|
547 |
|
548 |
|
549 |
class JinaBertGLUMLP(nn.Module):
|
550 |
def __init__(self, config: JinaBertConfig):
|
551 |
super().__init__()
|
552 |
self.config = config
|
553 |
+
self.gated_layers = nn.Linear(
|
554 |
+
config.hidden_size, config.intermediate_size * 2, bias=False
|
555 |
+
)
|
556 |
if config.feed_forward_type == 'reglu':
|
557 |
self.act = nn.ReLU()
|
558 |
elif config.feed_forward_type == 'geglu':
|
|
|
561 |
raise ValueError(
|
562 |
f"feed_forward_type {config.feed_forward_type} not supported"
|
563 |
)
|
564 |
+
self.wo = nn.Linear(config.intermediate_size, config.hidden_size)
|
|
|
|
|
|
|
565 |
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
566 |
+
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
567 |
|
568 |
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
569 |
+
residual_connection = hidden_states
|
570 |
+
# compute the activation
|
571 |
+
hidden_states = self.gated_layers(hidden_states)
|
572 |
+
gated = hidden_states[:, :, : self.config.intermediate_size]
|
573 |
+
non_gated = hidden_states[:, :, self.config.intermediate_size :]
|
574 |
+
hidden_states = self.act(gated) * non_gated
|
575 |
+
hidden_states = self.dropout(hidden_states)
|
576 |
+
# multiply by the second matrix
|
577 |
+
hidden_states = self.wo(hidden_states)
|
578 |
+
# add the residual connection and post-LN
|
579 |
+
hidden_states = self.layernorm(hidden_states + residual_connection)
|
580 |
+
return hidden_states
|
581 |
|
582 |
|
583 |
class JinaBertLayer(nn.Module):
|
|
|
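`JinaBertGLUMLP` now owns the gated projection (`gated_layers`), the output projection (`wo`), and a post-LayerNorm with a residual connection. A small standalone sketch of the same GEGLU computation, with illustrative dimensions rather than the repo's modules:

```python
# Sketch of the GEGLU feed-forward pattern used by JinaBertGLUMLP.
import torch
import torch.nn as nn

hidden, intermediate = 768, 3072
gated_layers = nn.Linear(hidden, intermediate * 2, bias=False)
wo = nn.Linear(intermediate, hidden)
act = nn.GELU()
layernorm = nn.LayerNorm(hidden, eps=1e-12)

x = torch.randn(2, 16, hidden)                  # (batch, seq, hidden)
residual = x
h = gated_layers(x)                             # (batch, seq, 2 * intermediate)
gated, non_gated = h[..., :intermediate], h[..., intermediate:]
h = act(gated) * non_gated                      # gate one half with the other
h = wo(h)                                       # project back to hidden size
out = layernorm(h + residual)                   # residual + post-LayerNorm
print(out.shape)                                # torch.Size([2, 16, 768])
```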
@@ -572,8 +589,6 @@ class JinaBertLayer(nn.Module):
         self.is_decoder = config.is_decoder
         self.add_cross_attention = config.add_cross_attention
         self.feed_forward_type = config.feed_forward_type
-        self.layer_norm_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.layer_norm_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         if self.add_cross_attention:
             if not self.is_decoder:
                 raise ValueError(
@@ -585,7 +600,8 @@
         if self.feed_forward_type.endswith('glu'):
             self.mlp = JinaBertGLUMLP(config)
         else:
-            self.
+            self.intermediate = JinaBertIntermediate(config)
+            self.output = JinaBertOutput(config)

     def forward(
         self,
@@ -598,9 +614,6 @@
         past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         output_attentions: Optional[bool] = False,
     ) -> Tuple[torch.Tensor]:
-        # Pre-Norm
-        residual = hidden_states
-
         # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
         self_attn_past_key_value = (
             past_key_value[:2] if past_key_value is not None else None
@@ -654,9 +667,15 @@
             cross_attn_present_key_value = cross_attention_outputs[-1]
             present_key_value = present_key_value + cross_attn_present_key_value

-[removed lines not rendered in this view]
+        if self.feed_forward_type.endswith('glu'):
+            layer_output = self.mlp(attention_output)
+        else:
+            layer_output = apply_chunking_to_forward(
+                self.feed_forward_chunk,
+                self.chunk_size_feed_forward,
+                self.seq_len_dim,
+                attention_output,
+            )
         outputs = (layer_output,) + outputs

         # if decoder, return the attn key/values as the last output
@@ -665,6 +684,11 @@

         return outputs

+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+

 class JinaBertEncoder(nn.Module):
     def __init__(self, config: JinaBertConfig):
@@ -675,6 +699,11 @@
         )
         self.gradient_checkpointing = False
         self.num_attention_heads = config.num_attention_heads
+        self.register_buffer(
+            "alibi",
+            self.rebuild_alibi_tensor(size=config.max_position_embeddings),
+            persistent=False,
+        )

     def rebuild_alibi_tensor(
         self, size: int, device: Optional[Union[torch.device, str]] = None
@@ -742,7 +771,23 @@

         # Add alibi matrix to extended_attention_mask
         _, seqlen, _ = hidden_states.size()
-[removed line not rendered in this view]
+        if self._current_alibi_size < seqlen:
+            # Rebuild the alibi tensor when needed
+            warnings.warn(
+                f'Increasing alibi size from {self._current_alibi_size} to {seqlen}.'
+            )
+            self.register_buffer(
+                "alibi",
+                self.rebuild_alibi_tensor(size=seqlen, device=hidden_states.device).to(
+                    hidden_states.dtype
+                ),
+                persistent=False,
+            )
+        elif self.alibi.device != hidden_states.device:
+            # Device catch-up
+            self.alibi = self.alibi.to(hidden_states.device)
+
+        alibi_bias = self.alibi[:, :, :seqlen, :seqlen]
         if self.gradient_checkpointing and self.training:
             if use_cache:
                 logger.warning_once(
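The encoder now registers the `alibi` buffer up front (sized to `max_position_embeddings`) and lazily regrows or moves it in `forward`. A simplified sketch of how such an ALiBi bias tensor can be built: the standard symmetric-ALiBi recipe for a power-of-two head count, not necessarily the exact logic inside `rebuild_alibi_tensor`:

```python
# Simplified symmetric ALiBi sketch: per-head slope times (negative) token distance.
import torch

def alibi_bias(num_heads: int, size: int) -> torch.Tensor:
    # Geometric slopes 2^(-8/n), 2^(-16/n), ... as in the ALiBi paper (power-of-two head counts).
    slopes = torch.tensor([2.0 ** (-8.0 * (i + 1) / num_heads) for i in range(num_heads)])
    positions = torch.arange(size)
    distance = (positions[None, :] - positions[:, None]).abs()   # (size, size)
    bias = -slopes[:, None, None] * distance[None, :, :]         # (heads, size, size)
    return bias.unsqueeze(0)                                     # (1, heads, size, size)

bias = alibi_bias(num_heads=8, size=16)
print(bias.shape)   # torch.Size([1, 8, 16, 16])
# In the encoder above, the buffer is sliced as alibi[:, :, :seqlen, :seqlen]
# and added to the attention mask before attention is computed.
```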
special_tokens_map.json
CHANGED
@@ -1,48 +1,34 @@
 {
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "cls_token": {
-    "content": "
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "</s>",
+    "content": "[CLS]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "mask_token": {
-    "content": "
-    "lstrip": 
+    "content": "[MASK]",
+    "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "
+    "content": "[PAD]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
-    "content": "
+    "content": "[SEP]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "unk_token": {
-    "content": "
+    "content": "[UNK]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json
CHANGED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
CHANGED
@@ -1,57 +1,57 @@
 {
-  "add_prefix_space": false,
   "added_tokens_decoder": {
     "0": {
-      "content": "
+      "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
+    "100": {
+      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
+    "101": {
+      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
+    "102": {
+      "content": "[SEP]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "
-      "content": "
-      "lstrip": 
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
-  "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
-  "cls_token": "
-  "
-  "
-  "mask_token": "
-  "model_max_length": 
-  "
-  "
-  "
-  "
-  "
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 2147483648,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
 }
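Both tokenizer files switch from RoBERTa-style specials (`<s>`, `</s>`, `<mask>`) to the BERT vocabulary's `[PAD]`/`[UNK]`/`[CLS]`/`[SEP]`/`[MASK]` with `BertTokenizer` as the tokenizer class. A quick sanity-check sketch (placeholder repo id):

```python
# Check the BERT-style special tokens after the switch (placeholder repo id).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-org/jina-ubuntu-support-embeddings")  # placeholder
print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token, tokenizer.mask_token)
# [CLS] [SEP] [PAD] [MASK]

ids = tokenizer("How do I mount a USB drive?")["input_ids"]
print(ids[0], ids[-1])  # 101 102 -> [CLS] ... [SEP], matching added_tokens_decoder above
```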
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:db9c2a1f1e15a402ec8b4ea591e6d667a5f19b4e63a681ac8eff6f8a74adf67b
 size 4719
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff