from typing import Any, Dict

import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Single model checkpoint used for both weights and processor.
MODEL_ID = "NousResearch/Obsidian-3B-V0.5"


class EndpointHandler:
    """Inference endpoint handler for the Obsidian-3B (LLaVA) multimodal model.

    Loads the model once at construction and serves text(+image) generation
    requests via ``__call__``.
    """

    def __init__(self, path: str = "", vision_model: str = "obsidian3b"):
        """Load model and processor onto GPU when available, else CPU.

        Args:
            path: Unused; kept for HF inference-endpoint handler interface.
            vision_model: Unused; kept for backward compatibility with callers.
        """
        # BUG FIX: original called torch.is_cuda_available(), which does not
        # exist (AttributeError at startup). The correct API is
        # torch.cuda.is_available().
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # NOTE(review): float16 weights on CPU may not be supported by all ops
        # during generate(); confirm CPU fallback is actually exercised.
        self.model = LlavaForConditionalGeneration.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        ).to(device)
        self.processor = AutoProcessor.from_pretrained(MODEL_ID)

    def __call__(self, data: Dict[str, Any]) -> str:
        """Run multimodal generation for one request.

        data args:
            inputs (str): the text prompt.
            image (Image): the image to condition on (may be None).
        Return:
            The decoded generation as a plain string (serialized by the server).
        """
        prompt = data.pop("inputs", "")
        image = data.pop("image", None)

        model_inputs = self.processor(prompt, image, return_tensors="pt")
        # BUG FIX: tensors from the processor live on CPU; move them to the
        # model's device, otherwise generate() fails on a CUDA host.
        model_inputs = model_inputs.to(self.model.device)

        output_ids = self.model.generate(
            **model_inputs, do_sample=False, max_new_tokens=4096
        )
        return self.processor.decode(output_ids[0], skip_special_tokens=True)