# blip2-image-to-text / handler.py
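"""Custom Inference Endpoints handler for BLIP-2 image captioning.

Receives a base64-encoded image under "inputs" and returns generated
captions using Salesforce/blip2-opt-2.7b.
"""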
import base64
import re
from io import BytesIO
from typing import Any, Dict

import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
class EndpointHandler:
    def __init__(self, path=""):
        # Load the BLIP-2 processor and model; device_map="auto" places the
        # weights on GPU when one is available. Note that the checkpoint is
        # pulled from the Hub rather than from the repository-local `path`.
        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b", device_map="auto"
        )
    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                Includes the base64-encoded input image under "inputs" and,
                optionally, generation parameters under "parameters".
        Return:
            A :obj:`dict` containing one list, e.g.
            {"captions": ["A hugging face at the office"]}, where
            - "captions": a list of generated caption strings.
        """
        parameters = data.pop("parameters", {})
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Strip an optional data-URI prefix, then decode the base64 payload.
        image_bytes = base64.b64decode(re.sub(r"^data:image/.+;base64,", "", data["inputs"]))
        raw_image = Image.open(BytesIO(image_bytes))
        processed_image = self.processor(images=raw_image, return_tensors="pt").to(device)
        # Forward any user-supplied generation parameters to generate().
        out = self.model.generate(**processed_image, **parameters)
        caption = self.processor.decode(out[0], skip_special_tokens=True)
        # Wrap the single caption in a list to match the documented contract.
        return {"captions": [caption]}
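# A minimal local smoke test, a sketch only: "test_image.jpg" and the
# max_new_tokens value are hypothetical, not part of the endpoint contract.
# It mimics the JSON payload Inference Endpoints would send, with the image
# bytes base64-encoded under "inputs". The Inference Toolkit instantiates
# EndpointHandler itself in production, so this block runs only when the
# file is executed directly.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("test_image.jpg", "rb") as f:
        payload = {
            "inputs": base64.b64encode(f.read()).decode("utf-8"),
            "parameters": {"max_new_tokens": 30},
        }
    print(handler(payload))  # e.g. {"captions": ["..."]}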