import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor


class EndpointHandler:
    def __init__(self, model_dir):
        # Initialize the model and processor. Note: model_dir is ignored here;
        # the handler loads the pinned Hub id directly. Point from_pretrained
        # at model_dir instead if the weights are bundled with the endpoint.
        model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"
        self.model = MllamaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        self.processor = AutoProcessor.from_pretrained(model_id)

    def __call__(self, inputs):
        # Hugging Face Inference Endpoints invoke the handler via __call__;
        # delegate to process() so the logic below can also be used directly.
        return self.process(inputs)

    def process(self, inputs):
        """
        Process the input data and return the output.

        Expects `inputs` to be a dictionary containing 'image_url' and,
        optionally, 'prompt'.
        """
        image_url = inputs.get("image_url")
        prompt = inputs.get("prompt", "If I had to write a haiku for this one, it would be:")

        # Fetch the image and decode it from the response stream
        image = Image.open(requests.get(image_url, stream=True).raw)

        # Build a single-turn chat message pairing the image with the prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        input_text = self.processor.apply_chat_template(messages, add_generation_prompt=True)

        # add_special_tokens=False avoids duplicating the BOS token, since the
        # chat template has already inserted the special tokens
        model_inputs = self.processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(self.model.device)

        output = self.model.generate(**model_inputs, max_new_tokens=30)

        # Decode the generated token ids back to a string
        return self.processor.decode(output[0])
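

# A minimal usage sketch (not part of the handler contract): instantiate the
# handler and call it with a dictionary payload, mirroring the JSON body an
# endpoint request would carry. The image URL below is a placeholder
# assumption; substitute any publicly reachable image. Running this requires
# enough GPU memory for the 90B checkpoint and access to the gated
# meta-llama repository.
if __name__ == "__main__":
    handler = EndpointHandler(model_dir=".")
    result = handler({
        "image_url": "https://example.com/some-image.jpg",  # placeholder URL
        "prompt": "If I had to write a haiku for this one, it would be:",
    })
    print(result)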