How do I use HuggingFaceEmbeddings to generate embeddings?
#2 · opened by evan209
How should it be configured? Is there an example of how to use it?
You can refer to the following code:
from pydantic import BaseModel, Extra, Field
from typing import Any, Dict, List, Optional
from langchain.embeddings.base import Embeddings
import torch
from tqdm import tqdm
import sys
import os

sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '..'))
class HuggingFaceLLMEmbeddings(BaseModel, Embeddings):
    client: Any
    tokenizer: Any
    client_rerank: Any
    model_name: str = ''
    batch_size: int = 1
    trunc_max_length: int = 512
    mean_token_size: int = 0
    cache_folder: Optional[str] = None
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    encode_kwargs: Dict[str, Any] = Field(default_factory=dict)

    def __init__(self, model_path='', batch_size=1, trunc_length=512, **kwargs: Any):
        super().__init__(**kwargs)
        self.mean_token_size = 0
        from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification

        embedding_model_name = model_path
        self.trunc_max_length = trunc_length
        self.batch_size = batch_size
        with torch.no_grad():
            self.tokenizer = AutoTokenizer.from_pretrained(embedding_model_name, trust_remote_code=True)
            self.client = AutoModel.from_pretrained(embedding_model_name)
            if torch.cuda.is_available():
                available_gpus = list(range(torch.cuda.device_count()))
                if len(available_gpus) == 1:
                    self.client = self.client.cuda()
                    # client_rerank is never loaded in this snippet; only move it to GPU if it exists.
                    if self.client_rerank is not None:
                        self.client_rerank = self.client_rerank.cuda()
                else:
                    # Spread the model across all visible GPUs.
                    self.client = torch.nn.DataParallel(self.client, device_ids=available_gpus).cuda()
                    if self.client_rerank is not None:
                        self.client_rerank = torch.nn.DataParallel(self.client_rerank, device_ids=available_gpus).cuda()
            self.client.eval()

    class Config:
        """Configuration for this pydantic object."""
        extra = Extra.forbid
    def embed_documents(self, texts: List[str], use_tqdm=True, use_instruction=False) -> List[List[float]]:
        embeddings_all = []
        with torch.no_grad():
            if use_instruction:
                # Query instruction prefix used by BGE-style retrieval models.
                instruction = "为这个句子生成表示以用于检索相关文章:"
            else:
                instruction = ''
            texts = list(map(lambda x: instruction + x.replace("\n", " "), texts))
            tbar = tqdm(range(0, len(texts), self.batch_size)) if use_tqdm else range(0, len(texts), self.batch_size)
            for i in tbar:
                # Tokenize one batch, truncating to trunc_max_length tokens.
                texts_ = self.tokenizer(texts[i: min(i + self.batch_size, len(texts))], truncation=True, max_length=self.trunc_max_length, padding=True, return_tensors='pt')
                if torch.cuda.is_available():
                    texts_ = texts_.to('cuda')
                model_output = self.client(**texts_)
                # Take the [CLS] token embedding and L2-normalize it.
                embeddings = model_output[0][:, 0]
                embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)
                embeddings_all.append(embeddings.cpu())
        embeddings_all = torch.cat(embeddings_all, dim=0)
        return embeddings_all.tolist()
    def embed_query(self, text: str) -> List[float]:
        # For s2p (query-to-passage) retrieval, set use_instruction=True; for s2s
        # (sentence-to-sentence) similarity, set use_instruction=False.
        # Returns a single vector, as the LangChain Embeddings interface expects.
        embedding = self.embed_documents([text], use_tqdm=False, use_instruction=False)
        return embedding[0]
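
Here is a minimal usage sketch of the class above (not part of the original post): the model path "BAAI/bge-large-zh-v1.5" and the sample texts are placeholders, so substitute your own local path or Hub model.

# Hypothetical usage example; the model path and texts below are placeholders.
embedder = HuggingFaceLLMEmbeddings(
    model_path="BAAI/bge-large-zh-v1.5",  # assumption: any BGE-style encoder works here
    batch_size=8,
    trunc_length=512,
)

# Embed a small corpus (no instruction prefix, i.e. s2s style).
doc_vectors = embedder.embed_documents(
    ["LangChain 支持自定义 Embeddings。", "HuggingFace 模型可以在本地加载。"],
    use_tqdm=False,
)
print(len(doc_vectors), len(doc_vectors[0]))  # number of texts, embedding dimension

# Embed a single query; returns one vector.
query_vector = embedder.embed_query("如何自定义 embedding 模型?")
print(len(query_vector))

For a plain setup, LangChain's built-in HuggingFaceEmbeddings (from langchain.embeddings import HuggingFaceEmbeddings) can also be configured directly with model_name, model_kwargs, and encode_kwargs; it wraps sentence-transformers. The custom class above is mainly useful when you need control over batching, truncation length, or multi-GPU DataParallel.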
Thank you very much!