from typing import Any, Dict, List

from transformers import (
    Idefics2ForConditionalGeneration,
    Idefics2Model,  # kept for backward compatibility with the original import list
    Idefics2Processor,
)


class EndpointHandler:
    """Custom inference handler that generates text from an image with Idefics2.

    The processor and model are loaded once at construction time and reused
    for every request handled by ``__call__``.
    """

    def __init__(self, path=""):
        """Load the processor and the generation-capable model from ``path``.

        Args:
            path: Directory or hub identifier passed unchanged to
                ``from_pretrained`` for both the processor and the model.
        """
        # Preload all the elements you are going to need at inference.
        self.processor = Idefics2Processor.from_pretrained(path)
        # ``generate`` needs the language-model head, which the bare
        # ``Idefics2Model`` backbone does not provide.
        self.model = Idefics2ForConditionalGeneration.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run image-conditioned generation for one request payload.

        Args:
            data: Request payload. ``data["inputs"]`` holds the image(s)
                (documented by the original handler as ``str`` |
                ``PIL.Image`` | ``np.array``); when the key is absent the
                whole payload is used as the input. An optional
                ``data["parameters"]`` dict is forwarded as keyword
                arguments to ``model.generate``.

        Returns:
            A list with one ``{"generated_text": ...}`` dict per decoded
            sequence; it is serialized and returned to the client.
        """
        # Read without mutating the caller's payload.
        image = data.get("inputs", data)
        parameters = data.get("parameters") or {}

        # NOTE(review): only images are handed to the processor; Idefics2
        # checkpoints presumably expect a text prompt with image
        # placeholders as well -- confirm this produces usable inputs.
        model_inputs = self.processor(images=image, return_tensors="pt")

        # Run prediction.
        generated_ids = self.model.generate(**model_inputs, **parameters)

        # Decode output.
        generated_text = self.processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )

        return [{"generated_text": text} for text in generated_text]