update README.md
README.md CHANGED
@@ -2614,7 +2614,6 @@ import torch.nn.functional as F
 
 from torch import Tensor
 from transformers import AutoTokenizer, AutoModel
-from transformers.modeling_outputs import BaseModelOutput
 
 
 def average_pool(last_hidden_states: Tensor,
@@ -2636,7 +2635,7 @@ model = AutoModel.from_pretrained('intfloat/e5-large')
 # Tokenize the input texts
 batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
 
-outputs: BaseModelOutput = model(**batch_dict)
+outputs = model(**batch_dict)
 embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
 
 # (Optionally) normalize embeddings
@@ -2654,3 +2653,20 @@ Please refer to our paper at [https://arxiv.org/pdf/2212.03533.pdf](https://arxiv.org/pdf/2212.03533.pdf)
 Check out [unilm/e5](https://github.com/microsoft/unilm/tree/master/e5) to reproduce evaluation results
 on the [BEIR](https://arxiv.org/abs/2104.08663) and [MTEB benchmark](https://arxiv.org/abs/2210.07316).
 
+## Citation
+
+If you find our paper or models helpful, please consider citing as follows:
+
+```
+@article{wang2022text,
+  title={Text Embeddings by Weakly-Supervised Contrastive Pre-training},
+  author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Jiao, Binxing and Yang, Linjun and Jiang, Daxin and Majumder, Rangan and Wei, Furu},
+  journal={arXiv preprint arXiv:2212.03533},
+  year={2022}
+}
+```
+
+## Limitations
+
+This model only works for English texts. Long texts will be truncated to at most 512 tokens.
+
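For reference, the snippet the two code hunks touch can be assembled into a single runnable example. This is a minimal sketch only: the `average_pool` body, the tokenizer instantiation, the example `input_texts`, and the final similarity score are illustrative assumptions not shown in the hunks, while the remaining lines mirror the diff context above.

```python
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    # Illustrative masked-mean pooling: zero out padding positions,
    # then divide by the number of real tokens in each sequence.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


# Illustrative inputs; E5 expects "query: " / "passage: " prefixes.
input_texts = [
    "query: how do I compute sentence embeddings",
    "passage: Sentence embeddings can be computed by average pooling a transformer's last hidden states.",
]

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large')
model = AutoModel.from_pretrained('intfloat/e5-large')

# Tokenize the input texts (inputs longer than 512 tokens are truncated, cf. Limitations)
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# (Optionally) normalize embeddings so dot products become cosine similarities
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = embeddings[:1] @ embeddings[1:].T
print(scores)
```

Dropping the `BaseModelOutput` annotation does not change behavior: `model(**batch_dict)` still returns a model-output object whose `last_hidden_state` the pooling reads. The `max_length=512, truncation=True` arguments are also what the Limitations note refers to: longer inputs are cut off at 512 tokens.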