File size: 1,541 Bytes

db9328f
 
 
 
 
 
b1e4650
d6460d1
db9328f
 
 
 
 
 
 
d6460d1
db9328f
 
 
 
 
 
 
 
 
 
 
d6460d1
 
d576d35
b1e4650
 
d576d35
 
db9328f
 
f49e3ec
 
c99069e
60be62e
f49e3ec
db9328f
d576d35

from typing import Dict, List, Any

from transformers import Blip2Processor, Blip2ForConditionalGeneration

from PIL import Image
from io import BytesIO
import torch, re, base64


class EndpointHandler:
    """Callable handler that captions a single image with BLIP-2 (OPT-2.7b).

    The processor and model are loaded once in ``__init__``; each call decodes
    the image found under ``data["inputs"]``, runs generation, and returns the
    decoded caption.
    """

    # Hub checkpoint used for both the processor and the model.
    MODEL_ID = "Salesforce/blip2-opt-2.7b"

    # Optional ``data:image/<type>;base64,`` prefix found on data-URI payloads.
    _DATA_URI_PREFIX = re.compile(r"^data:image/.+;base64,")

    def __init__(self, path=""):
        """Load the BLIP-2 processor and model.

        Args:
            path: Accepted for interface compatibility but not used; the
                weights are always loaded from ``MODEL_ID``.
        """
        self.processor = Blip2Processor.from_pretrained(self.MODEL_ID)
        # device_map="auto" lets the library decide where the weights live.
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            self.MODEL_ID, device_map="auto"
        )

    @classmethod
    def _decode_image_bytes(cls, payload: str) -> bytes:
        """Return the raw bytes of a base64 string, dropping any data-URI prefix."""
        return base64.b64decode(cls._DATA_URI_PREFIX.sub("", payload))

    def __call__(self, data: Any) -> Dict[str, Any]:
        """Generate a caption for the image in ``data["inputs"]``.

        Args:
            data: Mapping with an ``"inputs"`` entry holding either a
                base64-encoded image string (optionally prefixed with
                ``data:image/<type>;base64,``) or an already-opened PIL image.

        Returns:
            A dict of the form ``{"captions": "<generated caption>"}``, where
            the value is a single string with surrounding whitespace removed.

        Raises:
            KeyError: If ``data`` has no ``"inputs"`` entry.
            TypeError: If ``data["inputs"]`` is neither a string nor a PIL image.
        """
        payload = data["inputs"]

        if isinstance(payload, str):
            image = Image.open(BytesIO(self._decode_image_bytes(payload)))
        elif isinstance(payload, Image.Image):
            image = payload
        else:
            raise TypeError(
                "data['inputs'] must be a base64 string or a PIL image, "
                f"got {type(payload).__name__}"
            )

        # Send the tensors to wherever the model was actually placed instead
        # of re-deriving a device from CUDA availability on every request.
        model_inputs = self.processor(images=image, return_tensors="pt").to(
            self.model.device
        )

        generated_ids = self.model.generate(**model_inputs)

        caption = self.processor.decode(generated_ids[0], skip_special_tokens=True)

        # Postprocess: drop leading/trailing whitespace left by decoding.
        return {"captions": caption.strip()}
    

# NOTE(review): this constructs a handler at import time and discards the
# result, so the processor and model are loaded whenever this module is
# imported. Confirm this is intended (e.g. a smoke test or cache warm-up);
# otherwise consider guarding it with `if __name__ == "__main__":`.
EndpointHandler()