Kaguya-19 committed on
Commit
1023e8b
1 Parent(s): 5d2fd73

fit for Sentence Transformer

Browse files
Files changed (2) hide show
  1. README.md +92 -23
  2. config.json +1 -1
README.md CHANGED
@@ -85,40 +85,53 @@ flash-attn>2.3.5
85
 
86
  ### 示例脚本 Demo
87
 
 
 
88
  ```python
89
- from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
90
  import torch
91
  import numpy as np
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  model_name = "openbmb/MiniCPM-Reranker"
94
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
95
  tokenizer.padding_side = "right"
 
96
  model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True,attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
97
  model.eval()
98
- max_len_q, max_len_d = 512, 512
99
-
100
- def tokenize_our(query,doc):
101
- input_id_query = tokenizer.encode(query, add_special_tokens=False, max_length=max_len_q, truncation=True)
102
- input_id_doc = tokenizer.encode(doc, add_special_tokens=False, max_length=max_len_d, truncation=True)
103
- pad_input = {"input_ids": [tokenizer.bos_token_id] + input_id_query + [tokenizer.eos_token_id] + input_id_doc}
104
- return tokenizer.pad(
105
- pad_input,
106
- padding="max_length",
107
- max_length=max_len_q + max_len_d + 2,
108
- return_tensors="pt",
109
- )
110
 
111
  @torch.no_grad()
112
  def rerank(input_query, input_docs):
113
- tokenized_inputs = [tokenize_our(input_query, input_doc).to("cuda") for input_doc in input_docs]
114
- input_ids = {
115
- "input_ids": [tokenized_input["input_ids"] for tokenized_input in tokenized_inputs],
116
- "attention_mask": [tokenized_input["attention_mask"] for tokenized_input in tokenized_inputs]
117
- }
118
-
119
- for k in input_ids:
120
- input_ids[k] = torch.stack(input_ids[k]).to("cuda")
121
- outputs = model(**input_ids)
122
  score = outputs.logits
123
  return score.float().detach().cpu().numpy()
124
 
@@ -136,6 +149,62 @@ for i in range(len(queries)):
136
  print(np.array(scores)) # [[[-4.7460938][-8.8515625]]]
137
  ```
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  ## 实验结果 Evaluation Results
140
 
141
  ### 中文与英文重排序结果 CN/EN Re-ranking Results
 
85
 
86
  ### 示例脚本 Demo
87
 
88
+ #### Huggingface Transformers
89
+
90
  ```python
91
+ from transformers import AutoModel, LlamaTokenizer, AutoModelForSequenceClassification
92
  import torch
93
  import numpy as np
94
 
95
+ # from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
96
+ class MiniCPMRerankerLLamaTokenizer(LlamaTokenizer):
97
+ def build_inputs_with_special_tokens(
98
+ self, token_ids_0, token_ids_1 = None
99
+ ):
100
+ """
101
+ - single sequence: `<s> X </s>`
102
+ - pair of sequences: `<s> A </s> B`
103
+
104
+ Args:
105
+ token_ids_0 (`List[int]`):
106
+ List of IDs to which the special tokens will be added.
107
+ token_ids_1 (`List[int]`, *optional*):
108
+ Optional second list of IDs for sequence pairs.
109
+
110
+ Returns:
111
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
112
+ """
113
+
114
+ if token_ids_1 is None:
115
+ return super().build_inputs_with_special_tokens(token_ids_0)
116
+ bos = [self.bos_token_id]
117
+ sep = [self.eos_token_id]
118
+ return bos + token_ids_0 + sep + token_ids_1
119
+
120
  model_name = "openbmb/MiniCPM-Reranker"
121
+ tokenizer = MiniCPMRerankerLLamaTokenizer.from_pretrained(model_name, trust_remote_code=True)
122
  tokenizer.padding_side = "right"
123
+
124
  model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True,attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
125
  model.eval()
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  @torch.no_grad()
128
  def rerank(input_query, input_docs):
129
+ tokenized_inputs = tokenizer([[input_query, input_doc] for input_doc in input_docs], return_tensors="pt", padding=True, truncation=True, max_length=1024)
130
+
131
+ for k in tokenized_inputs:
132
+ tokenized_inputs [k] = tokenized_inputs[k].to("cuda")
133
+
134
+ outputs = model(**tokenized_inputs)
 
 
 
135
  score = outputs.logits
136
  return score.float().detach().cpu().numpy()
137
 
 
149
  print(np.array(scores)) # [[[-4.7460938][-8.8515625]]]
150
  ```
151
 
152
+ #### Sentence Transformer
153
+
154
+ ```python
155
+ from sentence_transformers import CrossEncoder
156
+ from transformers import LlamaTokenizer
157
+ import torch
158
+
159
+ # from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
160
+ class MiniCPMRerankerLLamaTokenizer(LlamaTokenizer):
161
+ def build_inputs_with_special_tokens(
162
+ self, token_ids_0, token_ids_1 = None
163
+ ):
164
+ """
165
+ - single sequence: `<s> X </s>`
166
+ - pair of sequences: `<s> A </s> B`
167
+
168
+ Args:
169
+ token_ids_0 (`List[int]`):
170
+ List of IDs to which the special tokens will be added.
171
+ token_ids_1 (`List[int]`, *optional*):
172
+ Optional second list of IDs for sequence pairs.
173
+
174
+ Returns:
175
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
176
+ """
177
+
178
+ if token_ids_1 is None:
179
+ return super().build_inputs_with_special_tokens(token_ids_0)
180
+ bos = [self.bos_token_id]
181
+ sep = [self.eos_token_id]
182
+ return bos + token_ids_0 + sep + token_ids_1
183
+
184
+ model_name = "openbmb/MiniCPM-Reranker"
185
+ model = CrossEncoder(model_name,max_length=1024,trust_remote_code=True, automodel_args={"attn_implementation":"flash_attention_2","torch_dtype": torch.float16})
186
+ model.tokenizer = MiniCPMRerankerLLamaTokenizer.from_pretrained(model_name, trust_remote_code=True)
187
+ model.tokenizer.padding_side = "right"
188
+
189
+ query = "中国的首都是哪里?"
190
+ passages = ["beijing", "shanghai"]
191
+
192
+ INSTRUCTION = "Query: "
193
+ query = INSTRUCTION + query
194
+
195
+ sentence_pairs = [[query, doc] for doc in passages]
196
+
197
+ scores = model.predict(sentence_pairs, convert_to_tensor=True).tolist()
198
+ rankings = model.rank(query, passages, return_documents=True, convert_to_tensor=True)
199
+
200
+ print(scores) # [0.0087432861328125, 0.00020503997802734375]
201
+ for ranking in rankings:
202
+ print(f"Score: {ranking['score']:.4f}, Corpus: {ranking['text']}")
203
+
204
+ # Score: 0.0087, Corpus: beijing
205
+ # Score: 0.0002, Corpus: shanghai
206
+ ```
207
+
208
  ## 实验结果 Evaluation Results
209
 
210
  ### 中文与英文重排序结果 CN/EN Re-ranking Results
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "openbmb/RankCPM-R",
3
  "architectures": [
4
  "MiniCPM"
5
  ],
 
1
  {
2
+ "_name_or_path": "openbmb/MiniCPM-Reranker",
3
  "architectures": [
4
  "MiniCPM"
5
  ],