Patch Sentence Transformers integration

#2 by tomaarsen - opened
{1_Pool → 1_Pooling}/config.json RENAMED
@@ -6,5 +6,5 @@
  "pooling_mode_mean_sqrt_len_tokens": false,
  "pooling_mode_weightedmean_tokens": false,
  "pooling_mode_lasttoken": false,
- "include_prompt": false
+ "include_prompt": true
  }
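Flipping `include_prompt` to `true` tells the pooling layer to keep the instruction/prompt tokens in the mean pooling instead of masking them out. Below is a minimal sketch of the corresponding sentence-transformers module; the embedding dimension and the mean pooling mode are placeholders for illustration, not values read from this repository's full config.

```python
from sentence_transformers.models import Pooling

# Placeholder dimension and pooling mode, for illustration only.
pooling = Pooling(
    word_embedding_dimension=2304,
    pooling_mode="mean",
    include_prompt=True,  # prompt tokens now contribute to the pooled embedding
)
print(pooling.get_config_dict())
```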
README.md CHANGED
@@ -262,6 +262,7 @@ model-index:
  pipeline_tag: feature-extraction
  tags:
  - mteb
+ - sentence-transformers
  library_name: transformers
  ---
  ## MiniCPM-Embedding
@@ -401,21 +402,18 @@ import torch
  from sentence_transformers import SentenceTransformer

  model_name = "openbmb/MiniCPM-Embedding"
- model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={"attn_implementation":"flash_attention_2", "torch_dtype":torch.float16})
- model.max_seq_length = 512
- model.tokenizer.padding_side="right"
+ model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={"attn_implementation": "flash_attention_2", "torch_dtype": torch.float16})

  queries = ["中国的首都是哪里?"]
  passages = ["beijing", "shanghai"]

-
  INSTRUCTION = "Query: "

- embeddings_query = model.encode(queries, prompt=INSTRUCTION, normalize_embeddings=True)
- embeddings_doc = model.encode(passages, normalize_embeddings=True)
+ embeddings_query = model.encode(queries, prompt=INSTRUCTION)
+ embeddings_doc = model.encode(passages)

  scores = (embeddings_query @ embeddings_doc.T)
- print(scores.tolist()) # [[0.3535913825035095, 0.18596848845481873]]
+ print(scores.tolist()) # [[0.35365450382232666, 0.18592746555805206]]
  ```

  ## 实验结果 Evaluation Results
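With `include_prompt` now `true`, passing `prompt=` at encode time should give the same embedding as prepending the instruction to the text yourself. A hedged sanity check of that equivalence (the tolerance is chosen loosely for float16; the model kwargs are simplified from the README snippet):

```python
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "openbmb/MiniCPM-Embedding",
    trust_remote_code=True,
    model_kwargs={"torch_dtype": torch.float16},
)

# "中国的首都是哪里?" = "What is the capital of China?"
via_prompt = model.encode(["中国的首都是哪里?"], prompt="Query: ")
manual = model.encode(["Query: 中国的首都是哪里?"])

# With include_prompt=true the prompt tokens are pooled like any others,
# so both paths should produce (nearly) identical embeddings.
print(np.allclose(via_prompt, manual, atol=1e-3))  # expected: True
```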
config_sentence_transformers.json CHANGED
@@ -4,6 +4,30 @@
  "transformers": "4.37.2",
  "pytorch": "2.0.1+cu121"
  },
- "prompts": {},
+ "prompts": {
+ "fiqa": "Instruction: Given a financial question, retrieve user replies that best answer the question. Query: ",
+ "dbpedia": "Instruction: Given a query, retrieve relevant entity descriptions from DBPedia. Query: ",
+ "CmedqaRetrieval": "Instruction: 为这个医疗问题检索相关回答。 Query: ",
+ "nfcorpus": "Instruction: Given a question, retrieve relevant documents that best answer the question. Query: ",
+ "touche2020": "Instruction: Given a question, retrieve detailed and persuasive arguments that answer the question. Query: ",
+ "CovidRetrieval": "Instruction: 为这个问题检索相关政策回答。 Query: ",
+ "scifact": "Instruction: Given a scientific claim, retrieve documents that support or refute the claim. Query: ",
+ "scidocs": "Instruction: Given a scientific paper title, retrieve paper abstracts that are cited by the given paper. Query: ",
+ "nq": "Instruction: Given a question, retrieve Wikipedia passages that answer the question. Query: ",
+ "T2Retrieval": "Instruction: 为这个问题检索相关段落。 Query: ",
+ "VideoRetrieval": "Instruction: 为这个电影标题检索相关段落。 Query: ",
+ "DuRetrieval": "Instruction: 为这个问题检索相关百度知道回答。 Query: ",
+ "MMarcoRetrieval": "Instruction: 为这个查询检索相关段落。 Query: ",
+ "hotpotqa": "Instruction: Given a multi-hop question, retrieve documents that can help answer the question. Query: ",
+ "quora": "Instruction: Given a question, retrieve questions that are semantically equivalent to the given question. Query: ",
+ "climate-fever": "Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: ",
+ "arguana": "Instruction: Given a claim, find documents that refute the claim. Query: ",
+ "fever": "Instruction: Given a claim, retrieve documents that support or refute the claim. Query: ",
+ "trec-covid": "Instruction: Given a query on COVID-19, retrieve documents that answer the query. Query: ",
+ "msmarco": "Instruction: Given a web search query, retrieve relevant passages that answer the query. Query: ",
+ "EcomRetrieval": "Instruction: 为这个查询检索相关商品标题。 Query: ",
+ "MedicalRetrieval": "Instruction: 为这个医学问题检索相关回答。 Query: ",
+ "CAQstack":"Instruction: Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question. Query: "
+ },
  "default_prompt_name": null
  }
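With these named prompts registered, `encode` can select an instruction by name instead of taking the raw string. A short usage sketch, assuming the patched config is what gets loaded (the query and passage text here are illustrative; `"nq"` is one of the keys above):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("openbmb/MiniCPM-Embedding", trust_remote_code=True)

# prompt_name looks the instruction up in the "prompts" mapping of
# config_sentence_transformers.json; passages are encoded without a prompt.
query_emb = model.encode(["who founded wikipedia"], prompt_name="nq")
doc_emb = model.encode(["Wikipedia was launched by Jimmy Wales and Larry Sanger."])

print(query_emb @ doc_emb.T)
```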
modules.json CHANGED
@@ -10,5 +10,11 @@
  "name": "1",
  "path": "1_Pooling",
  "type": "sentence_transformers.models.Pooling"
+ },
+ {
+ "idx": 2,
+ "name": "2",
+ "path": "2_Normalize",
+ "type": "sentence_transformers.models.Normalize"
  }
  ]
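The appended Normalize module L2-normalizes embeddings as the final pipeline step, which is why the README snippet can drop `normalize_embeddings=True`. A small check, assuming the patched files are the ones loaded:

```python
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("openbmb/MiniCPM-Embedding", trust_remote_code=True)

emb = model.encode(["beijing", "shanghai"])
# Expected: norms of roughly 1.0, because 2_Normalize runs after pooling.
print(np.linalg.norm(emb, axis=1))
# With unit-norm vectors, the dot products in the README are cosine similarities.
```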
sentence_bert_config.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "max_seq_length": 512
+ }
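This file is where `SentenceTransformer` reads its default sequence length, so the manual `model.max_seq_length = 512` line removed from the README is no longer needed. A brief illustrative check (the override at the end is optional and only a sketch):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("openbmb/MiniCPM-Embedding", trust_remote_code=True)
print(model.max_seq_length)  # expected: 512, taken from sentence_bert_config.json

# The value can still be overridden at runtime for longer inputs,
# subject to the model's own position limit:
model.max_seq_length = 512
```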