# blip2-flan-t5-xl/handler.py
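"""Custom handler for Hugging Face Inference Endpoints.

Runs Salesforce/blip2-flan-t5-xl image captioning: requests carry
base64-encoded images under "inputs", and the response is a dict with a
"captions" list, one caption per image.
"""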
import base64
from io import BytesIO
from typing import Any, Dict

import torch
from PIL import Image
from transformers import Blip2ForConditionalGeneration, Blip2Processor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the BLIP-2 FLAN-T5-XL processor and model from the Hub
        # (the local `path` argument is not used) and put the model on
        # the selected device in evaluation mode.
        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-flan-t5-xl"
        ).to(device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                Includes the input data and the parameters for the inference.
        Return:
            A :obj:`dict` with one list, e.g. {"captions": ["A hugging face at the office"]}, containing:
            - "captions": A list of strings, one generated caption per input image.
        """
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})
        # Accept either a single base64 string or a list of them.
        if isinstance(inputs, str):
            inputs = [inputs]
        # Decode the base64-encoded images and preprocess them as one batch.
        raw_images = [Image.open(BytesIO(base64.b64decode(img))) for img in inputs]
        model_inputs = self.processor(images=raw_images, return_tensors="pt")
        model_inputs["pixel_values"] = model_inputs["pixel_values"].to(device)
        with torch.no_grad():
            # Forward any request parameters (e.g. max_new_tokens) to generate().
            out = self.model.generate(**model_inputs, **parameters)
        # Postprocess: decode the generated token ids into caption strings.
        captions = self.processor.batch_decode(out, skip_special_tokens=True)
        return {"captions": captions}
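

# A minimal local smoke test, not part of the Inference Endpoints contract:
# the platform normally constructs the handler and calls it with the decoded
# request body. "example.jpg" below is a hypothetical placeholder path.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("example.jpg", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    result = handler({"inputs": [encoded], "parameters": {"max_new_tokens": 30}})
    print(result)  # e.g. {"captions": ["a photo of ..."]}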