Support sentence_transformers and fix readme
#2 · by izhx · opened
- 1_Pooling/config.json +10 -0
- README.md +25 -16
- modules.json +14 -0
- sentence_bert_config.json +4 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
+{
+    "word_embedding_dimension": 1024,
+    "pooling_mode_cls_token": true,
+    "pooling_mode_mean_tokens": false,
+    "pooling_mode_max_tokens": false,
+    "pooling_mode_mean_sqrt_len_tokens": false,
+    "pooling_mode_weightedmean_tokens": false,
+    "pooling_mode_lasttoken": false,
+    "include_prompt": true
+}
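Editor's note: with `pooling_mode_cls_token` set to true and every other mode disabled, the sentence embedding is just the final hidden state of the first token. A minimal sketch of that behaviour with plain `transformers` (the model id comes from this repo; everything else is illustrative):

```python
import torch
from transformers import AutoModel, AutoTokenizer

model_id = "Alibaba-NLP/gte-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)

batch = tokenizer(["That is a happy person"], padding=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**batch)

# CLS pooling: the hidden state of the first token is the sentence embedding,
# shape (1, 1024), matching "word_embedding_dimension" above.
embedding = outputs.last_hidden_state[:, 0]
```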
README.md
CHANGED
@@ -1,4 +1,6 @@
 ---
+datasets:
+- allenai/c4
 library_name: transformers
 tags:
 - gte
@@ -2168,7 +2170,7 @@ model-index:
 - type: mrr_at_1000
   value: 78.704
 - type: mrr_at_3
-  value: 77
+  value: 77
 - type: mrr_at_5
   value: 78.083
 - type: ndcg_at_1
@@ -2202,7 +2204,7 @@ model-index:
 - type: recall_at_100
   value: 99.833
 - type: recall_at_1000
-  value: 100
+  value: 100
 - type: recall_at_3
   value: 86.506
 - type: recall_at_5
@@ -2245,7 +2247,7 @@ model-index:
 - type: euclidean_precision
   value: 85.74181117533719
 - type: euclidean_recall
-  value: 89
+  value: 89
 - type: manhattan_accuracy
   value: 99.75445544554455
 - type: manhattan_ap
@@ -2336,19 +2338,19 @@ model-index:
 - type: map_at_5
   value: 1.028
 - type: mrr_at_1
-  value: 88
+  value: 88
 - type: mrr_at_10
-  value: 94
+  value: 94
 - type: mrr_at_100
-  value: 94
+  value: 94
 - type: mrr_at_1000
-  value: 94
+  value: 94
 - type: mrr_at_3
-  value: 94
+  value: 94
 - type: mrr_at_5
-  value: 94
+  value: 94
 - type: ndcg_at_1
-  value: 82
+  value: 82
 - type: ndcg_at_10
   value: 77.48899999999999
 - type: ndcg_at_100
@@ -2360,7 +2362,7 @@ model-index:
 - type: ndcg_at_5
   value: 80.449
 - type: precision_at_1
-  value: 88
+  value: 88
 - type: precision_at_10
   value: 82.19999999999999
 - type: precision_at_100
@@ -2368,7 +2370,7 @@ model-index:
 - type: precision_at_1000
   value: 23.684
 - type: precision_at_3
-  value: 88
+  value: 88
 - type: precision_at_5
   value: 85.6
 - type: recall_at_1
@@ -2627,7 +2629,7 @@ We also present the [`gte-Qwen1.5-7B-instruct`](https://huggingface.co/Alibaba-N
 | Models | Language | Model Size | Max Seq. Length | Dimension | MTEB-en | LoCo |
 |:-----: | :-----: |:-----: |:-----: |:-----: | :-----: | :-----: |
 |[`gte-Qwen1.5-7B-instruct`](https://huggingface.co/Alibaba-NLP/gte-Qwen1.5-7B-instruct)| English | 7720 | 32768 | 4096 | 67.34 | 87.57 |
-|[`gte-large-en-v1.5`](https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5) | English |
+|[`gte-large-en-v1.5`](https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5) | English | 434 | 8192 | 1024 | 65.39 | 86.71 |
 |[`gte-base-en-v1.5`](https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5) | English | 137 | 8192 | 768 | 64.11 | 87.44 |
 
 
@@ -2673,7 +2675,7 @@ from sentence_transformers.util import cos_sim
 
 sentences = ['That is a happy person', 'That is a very happy person']
 
-model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5')
+model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
 embeddings = model.encode(sentences)
 print(cos_sim(embeddings[0], embeddings[1]))
 ```
@@ -2688,6 +2690,11 @@ print(cos_sim(embeddings[0], embeddings[1]))
 
 ### Training Procedure
 
+To enable the backbone model to support a context length of 8192, we adopted a multi-stage training strategy.
+The model first undergoes preliminary MLM pre-training on shorter lengths.
+And then, we resample the data, reducing the proportion of short texts, and continue the MLM pre-training.
+
+The entire training process is as follows:
 - MLM-512: lr 2e-4, mlm_probability 0.3, batch_size 4096, num_steps 300000, rope_base 10000
 - MLM-2048: lr 5e-5, mlm_probability 0.3, batch_size 4096, num_steps 30000, rope_base 10000
 - MLM-8192: lr 5e-5, mlm_probability 0.3, batch_size 1024, num_steps 30000, rope_base 160000
@@ -2700,7 +2707,9 @@
 
 ### MTEB
 
-The
+The results of other models are retrieved from [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
+
+The gte evaluation setting: `mteb==1.2.0, fp16 auto mix precision, max_length=8192`, and set ntk scaling factor to 2 (equivalent to rope_base * 2).
 
 | Model Name | Param Size (M) | Dimension | Sequence Length | Average (56) | Class. (12) | Clust. (11) | Pair Class. (3) | Reran. (4) | Retr. (15) | STS (10) | Summ. (1) |
 |:----:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
@@ -2732,4 +2741,4 @@ The gte results setting: `mteb==1.2.0, fp16 auto mix precision, max_length=8192`
 
 **APA:**
 
-[More Information Needed]
+[More Information Needed]
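Editor's note: a minimal sketch of reproducing the evaluation setting mentioned in the README changes above, using the mteb 1.x API; the task choice and output folder are illustrative, and the ntk scaling / rope_base adjustment is a model-config change not shown here:

```python
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)
model.max_seq_length = 8192  # matches max_length=8192 in the evaluation setting

evaluation = MTEB(tasks=["STSBenchmark"])  # illustrative single task
evaluation.run(model, output_folder="results/gte-large-en-v1.5")
```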
modules.json
ADDED
@@ -0,0 +1,14 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  }
+]
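Editor's note: modules.json declares the two-stage pipeline (Transformer encoder followed by the Pooling module from 1_Pooling). A rough, untested sketch of building the same pipeline by hand with the sentence-transformers modules API, assuming trust_remote_code is forwarded through model_args/tokenizer_args:

```python
from sentence_transformers import SentenceTransformer, models

# Module 0: the transformer backbone (custom gte code, hence trust_remote_code).
word = models.Transformer(
    "Alibaba-NLP/gte-large-en-v1.5",
    max_seq_length=8192,
    model_args={"trust_remote_code": True},
    tokenizer_args={"trust_remote_code": True},
)

# Module 1: CLS pooling over the 1024-dim token embeddings, as in 1_Pooling/config.json.
pooling = models.Pooling(word.get_word_embedding_dimension(), pooling_mode="cls")

model = SentenceTransformer(modules=[word, pooling])
```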
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "max_seq_length": 8192,
+  "do_lower_case": false
+}
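Editor's note: sentence_bert_config.json is what seeds `max_seq_length` when the model is loaded through sentence-transformers; it can still be overridden at runtime, for example to trade context length for speed (illustrative value below):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)
print(model.max_seq_length)  # 8192, taken from sentence_bert_config.json
model.max_seq_length = 512   # optional runtime override (illustrative)
```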