请问怎么使用HuggingFaceEmbeddings 来进行embedding

#2
by evan209 - opened

配置该怎么配置?有没有使用方法?

可以参考以下代码

from pydantic import BaseModel, Extra, Field
from typing import Any, Dict, List, Optional
from langchain.embeddings.base import Embeddings
import torch
from tqdm import tqdm
import sys
import os
# Make the parent directory importable.
# BUG FIX: the original used bare `file`, which is undefined — the module
# path lives in the dunder `__file__`.
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '..'))

class HuggingFaceLLMEmbeddings(BaseModel, Embeddings):
    """LangChain-compatible embeddings backed by a local HuggingFace encoder.

    Loads a transformer encoder via ``AutoModel.from_pretrained`` and embeds
    batches of text by taking the CLS-token hidden state, L2-normalized.
    """

    client: Any                   # loaded transformer encoder (AutoModel)
    tokenizer: Any                # tokenizer paired with ``client``
    client_rerank: Any            # optional rerank model; NOT loaded by this class (pydantic Any -> defaults to None)
    model_name: str = ''
    batch_size: int = 1           # texts encoded per forward pass
    trunc_max_length: int = 512   # max token length; longer inputs are truncated
    mean_token_size: int = 0
    cache_folder: Optional[str] = None
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    encode_kwargs: Dict[str, Any] = Field(default_factory=dict)

    class Config:
        """Configuration for this pydantic object."""
        extra = Extra.forbid

    def __init__(self, model_path='', batch_size=1, trunc_length=512, **kwargs: Any):
        """Load tokenizer and model from ``model_path``; move to GPU(s) if available.

        Args:
            model_path: HuggingFace model id or local checkpoint directory.
            batch_size: number of texts per encoding batch.
            trunc_length: maximum token length before truncation.
        """
        super().__init__(**kwargs)
        self.mean_token_size = 0
        # Imported lazily so importing this module does not require transformers.
        from transformers import AutoTokenizer, AutoModel
        self.trunc_max_length = trunc_length
        self.batch_size = batch_size

        with torch.no_grad():
            self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
            self.client = AutoModel.from_pretrained(model_path)

            if torch.cuda.is_available():
                available_gpus = list(range(torch.cuda.device_count()))
                if len(available_gpus) == 1:
                    self.client = self.client.cuda()
                else:
                    self.client = torch.nn.DataParallel(self.client, device_ids=available_gpus).cuda()
                # BUG FIX: the original unconditionally called .cuda() on
                # ``client_rerank``, but no rerank model is ever loaded in
                # this class, so it is None and the call raised
                # AttributeError on any CUDA machine. Guard on None; a
                # caller may still inject a rerank model via kwargs.
                if self.client_rerank is not None:
                    if len(available_gpus) == 1:
                        self.client_rerank = self.client_rerank.cuda()
                    else:
                        self.client_rerank = torch.nn.DataParallel(
                            self.client_rerank, device_ids=available_gpus).cuda()
            self.client.eval()

    def embed_documents(self, texts: List[str], use_tqdm=True, use_instruction=False) -> List[List[float]]:
        """Embed a list of texts; returns one L2-normalized vector per text.

        Args:
            texts: texts to embed; newlines are replaced by spaces.
            use_tqdm: show a progress bar over batches.
            use_instruction: prepend the BGE-style retrieval instruction
                (useful for s2p retrieval queries).
        """
        embeddings_all = []
        with torch.no_grad():
            if use_instruction:
                # BGE retrieval instruction ("generate a representation of this
                # sentence for retrieving related articles"). Kept in Chinese:
                # it is part of the model's expected input, not a comment.
                instruction = "为这个句子生成表示以用于检索相关文章:"
            else:
                instruction = ''
            texts = [instruction + t.replace("\n", " ") for t in texts]
            batch_starts = range(0, len(texts), self.batch_size)
            if use_tqdm:
                batch_starts = tqdm(batch_starts)
            for i in batch_starts:
                batch = self.tokenizer(
                    texts[i: i + self.batch_size],
                    truncation=True,
                    max_length=self.trunc_max_length,
                    padding=True,
                    return_tensors='pt',
                )
                if torch.cuda.is_available():
                    batch = batch.to('cuda')
                model_output = self.client(**batch)
                # CLS-token hidden state, L2-normalized.
                embeddings = model_output[0][:, 0]
                embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)
                embeddings_all.append(embeddings.cpu())
        return torch.cat(embeddings_all, dim=0).tolist()

    def embed_query(self, texts: List[str], use_tqdm=False) -> List[List[float]]:
        """Embed query texts.

        NOTE(review): unlike LangChain's ``Embeddings.embed_query`` this takes
        a *list* of strings, not a single string — kept for backward
        compatibility with existing callers. For s2p tasks one could pass
        use_instruction=True; for s2s tasks use_instruction=False (current).
        """
        return self.embed_documents(texts, use_tqdm=use_tqdm, use_instruction=False)

非常感谢

Sign up or log in to comment