import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor


class EndpointHandler:
    def __init__(self, model_dir):
        # Initialize the model and processor. Note: model_dir is ignored here;
        # the handler loads the pinned Hub id directly. Point from_pretrained
        # at model_dir instead if the weights are bundled with the endpoint.
        model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"
        self.model = MllamaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        self.processor = AutoProcessor.from_pretrained(model_id)

    def __call__(self, inputs):
        # Hugging Face Inference Endpoints invoke the handler via __call__;
        # delegate to process() so the logic below can also be used directly.
        return self.process(inputs)

    def process(self, inputs):
        """
        Process the input data and return the output.

        Expects `inputs` to be a dictionary containing 'image_url' and,
        optionally, 'prompt'.
        """
        image_url = inputs.get("image_url")
        prompt = inputs.get("prompt", "If I had to write a haiku for this one, it would be:")

        # Fetch the image and decode it from the response stream
        image = Image.open(requests.get(image_url, stream=True).raw)

        # Build a single-turn chat message pairing the image with the prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        input_text = self.processor.apply_chat_template(messages, add_generation_prompt=True)

        # add_special_tokens=False avoids duplicating the BOS token, since the
        # chat template has already inserted the special tokens
        model_inputs = self.processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(self.model.device)

        output = self.model.generate(**model_inputs, max_new_tokens=30)

        # Decode the generated token ids back to a string
        return self.processor.decode(output[0])
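

# A minimal usage sketch (not part of the handler contract): instantiate the
# handler and call it with a dictionary payload, mirroring the JSON body an
# endpoint request would carry. The image URL below is a placeholder
# assumption; substitute any publicly reachable image. Running this requires
# enough GPU memory for the 90B checkpoint and access to the gated
# meta-llama repository.
if __name__ == "__main__":
    handler = EndpointHandler(model_dir=".")
    result = handler({
        "image_url": "https://example.com/some-image.jpg",  # placeholder URL
        "prompt": "If I had to write a haiku for this one, it would be:",
    })
    print(result)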