from typing import Any, Dict, List

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import deepspeed


class EndpointHandler:
    """Serve a sequence-classification model through a DeepSpeed-accelerated pipeline.

    The model and tokenizer are loaded once in ``__init__``; each request is
    then handled by calling the instance with the request payload.
    """

    def __init__(self, path=""):
        """Load the model and tokenizer from *path* and build the pipeline.

        Args:
            path: Location passed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        # Load model and tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForSequenceClassification.from_pretrained(path)

        # Init the DeepSpeed inference engine.
        # NOTE(review): newer DeepSpeed releases appear to deprecate `mp_size`
        # and `replace_method` -- confirm against the pinned DeepSpeed version
        # before changing these arguments.
        ds_model = deepspeed.init_inference(
            model=model,  # Transformers model
            mp_size=1,  # number of GPUs
            dtype=torch.half,  # dtype of the weights (fp16)
            # injection_policy={"BertLayer": HFBertLayerPolicy},  # replace BertLayer with DS HFBertLayerPolicy
            replace_method="auto",  # let DS automatically identify the layer to replace
            replace_with_kernel_inject=True,  # replace the model with the kernel injector
        )

        # Create the accelerated pipeline on GPU 0.
        self.pipeline = pipeline(
            "text-classification",
            model=ds_model,
            tokenizer=tokenizer,
            device=0,
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run the text-classification pipeline on a request payload.

        Args:
            data: Request payload. Recognized keys:
                ``inputs``: value handed to the pipeline as its input. When
                    the key is absent, the whole payload (without
                    ``parameters``) is passed instead.
                ``parameters`` (optional): mapping of keyword arguments
                    forwarded to the pipeline call.

        Returns:
            The pipeline's prediction, returned unchanged so the caller can
            serialize it.
        """
        # Work on a shallow copy so the caller's payload is never mutated.
        payload = dict(data)
        inputs = payload.pop("inputs", payload)
        parameters = payload.pop("parameters", None)

        # Forward the optional per-request keyword arguments to the pipeline.
        if parameters is not None:
            return self.pipeline(inputs, **parameters)
        return self.pipeline(inputs)